vortex_btrblocks/compressor/integer/
stats.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::ToCanonical;
9use vortex_array::arrays::NativeValue;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::PrimitiveVTable;
12use vortex_array::dtype::IntegerPType;
13use vortex_array::expr::stats::Stat;
14use vortex_array::match_each_integer_ptype;
15use vortex_array::scalar::PValue;
16use vortex_array::scalar::Scalar;
17use vortex_buffer::BitBuffer;
18use vortex_error::VortexError;
19use vortex_error::VortexExpect;
20use vortex_error::VortexResult;
21use vortex_mask::AllOr;
22use vortex_utils::aliases::hash_map::HashMap;
23
24use crate::CompressorStats;
25use crate::GenerateStatsOptions;
26use crate::compressor::rle::RLEStats;
27use crate::sample::sample;
28
29#[derive(Clone, Debug)]
30pub struct TypedStats<T> {
31    pub min: T,
32    pub max: T,
33    pub top_value: T,
34    pub top_count: u32,
35    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
36}
37
38/// Type-erased container for one of the [TypedStats] variants.
39///
40/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
41/// set of stats. We then perform a variety of access methods on them.
42#[derive(Clone, Debug)]
43pub enum ErasedStats {
44    U8(TypedStats<u8>),
45    U16(TypedStats<u16>),
46    U32(TypedStats<u32>),
47    U64(TypedStats<u64>),
48    I8(TypedStats<i8>),
49    I16(TypedStats<i16>),
50    I32(TypedStats<i32>),
51    I64(TypedStats<i64>),
52}
53
54impl ErasedStats {
55    pub fn min_is_zero(&self) -> bool {
56        match &self {
57            ErasedStats::U8(x) => x.min == 0,
58            ErasedStats::U16(x) => x.min == 0,
59            ErasedStats::U32(x) => x.min == 0,
60            ErasedStats::U64(x) => x.min == 0,
61            ErasedStats::I8(x) => x.min == 0,
62            ErasedStats::I16(x) => x.min == 0,
63            ErasedStats::I32(x) => x.min == 0,
64            ErasedStats::I64(x) => x.min == 0,
65        }
66    }
67
68    pub fn min_is_negative(&self) -> bool {
69        match &self {
70            ErasedStats::U8(_)
71            | ErasedStats::U16(_)
72            | ErasedStats::U32(_)
73            | ErasedStats::U64(_) => false,
74            ErasedStats::I8(x) => x.min < 0,
75            ErasedStats::I16(x) => x.min < 0,
76            ErasedStats::I32(x) => x.min < 0,
77            ErasedStats::I64(x) => x.min < 0,
78        }
79    }
80
81    // Difference between max and min.
82    pub fn max_minus_min(&self) -> u64 {
83        match &self {
84            ErasedStats::U8(x) => (x.max - x.min) as u64,
85            ErasedStats::U16(x) => (x.max - x.min) as u64,
86            ErasedStats::U32(x) => (x.max - x.min) as u64,
87            ErasedStats::U64(x) => x.max - x.min,
88            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
89            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
90            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
91            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
92                .vortex_expect("max minus min result bigger than u64"),
93        }
94    }
95
96    /// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero.
97    ///
98    /// This matches how BitPacking computes bit width: it reinterprets signed values as
99    /// unsigned (preserving bit pattern) and uses leading_zeros. For non-negative signed
100    /// values, the transmuted value equals the original value.
101    ///
102    /// This is used to determine if FOR encoding would reduce bit width compared to
103    /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
104    pub fn max_ilog2(&self) -> Option<u32> {
105        match &self {
106            ErasedStats::U8(x) => x.max.checked_ilog2(),
107            ErasedStats::U16(x) => x.max.checked_ilog2(),
108            ErasedStats::U32(x) => x.max.checked_ilog2(),
109            ErasedStats::U64(x) => x.max.checked_ilog2(),
110            // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior
111            ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
112            ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
113            ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
114            ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
115        }
116    }
117
118    /// Get the most commonly occurring value and its count
119    pub fn top_value_and_count(&self) -> (PValue, u32) {
120        match &self {
121            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
122            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
123            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
124            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
125            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
126            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
127            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
128            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
129        }
130    }
131}
132
133macro_rules! impl_from_typed {
134    ($T:ty, $variant:path) => {
135        impl From<TypedStats<$T>> for ErasedStats {
136            fn from(typed: TypedStats<$T>) -> Self {
137                $variant(typed)
138            }
139        }
140    };
141}
142
143impl_from_typed!(u8, ErasedStats::U8);
144impl_from_typed!(u16, ErasedStats::U16);
145impl_from_typed!(u32, ErasedStats::U32);
146impl_from_typed!(u64, ErasedStats::U64);
147impl_from_typed!(i8, ErasedStats::I8);
148impl_from_typed!(i16, ErasedStats::I16);
149impl_from_typed!(i32, ErasedStats::I32);
150impl_from_typed!(i64, ErasedStats::I64);
151
152/// Array of integers and relevant stats for compression.
153#[derive(Clone, Debug)]
154pub struct IntegerStats {
155    pub(super) src: PrimitiveArray,
156    // cache for validity.false_count()
157    pub(super) null_count: u32,
158    // cache for validity.true_count()
159    pub(super) value_count: u32,
160    pub(super) average_run_length: u32,
161    pub(super) distinct_values_count: u32,
162    pub(crate) typed: ErasedStats,
163}
164
165impl IntegerStats {
166    fn generate_opts_fallible(
167        input: &PrimitiveArray,
168        opts: GenerateStatsOptions,
169    ) -> VortexResult<Self> {
170        match_each_integer_ptype!(input.ptype(), |T| {
171            typed_int_stats::<T>(input, opts.count_distinct_values)
172        })
173    }
174}
175
176impl CompressorStats for IntegerStats {
177    type ArrayVTable = PrimitiveVTable;
178
179    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
180        Self::generate_opts_fallible(input, opts)
181            .vortex_expect("IntegerStats::generate_opts should not fail")
182    }
183
184    fn source(&self) -> &PrimitiveArray {
185        &self.src
186    }
187
188    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
189        let sampled = sample(self.src.as_ref(), sample_size, sample_count).to_primitive();
190
191        Self::generate_opts(&sampled, opts)
192    }
193}
194
195impl RLEStats for IntegerStats {
196    fn value_count(&self) -> u32 {
197        self.value_count
198    }
199
200    fn average_run_length(&self) -> u32 {
201        self.average_run_length
202    }
203
204    fn source(&self) -> &PrimitiveArray {
205        &self.src
206    }
207}
208
209fn typed_int_stats<T>(
210    array: &PrimitiveArray,
211    count_distinct_values: bool,
212) -> VortexResult<IntegerStats>
213where
214    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
215    TypedStats<T>: Into<ErasedStats>,
216    NativeValue<T>: Eq + Hash,
217{
218    // Special case: empty array
219    if array.is_empty() {
220        return Ok(IntegerStats {
221            src: array.clone(),
222            null_count: 0,
223            value_count: 0,
224            average_run_length: 0,
225            distinct_values_count: 0,
226            typed: TypedStats {
227                min: T::max_value(),
228                max: T::min_value(),
229                top_value: T::default(),
230                top_count: 0,
231                distinct_values: HashMap::with_hasher(FxBuildHasher),
232            }
233            .into(),
234        });
235    } else if array.all_invalid()? {
236        return Ok(IntegerStats {
237            src: array.clone(),
238            null_count: u32::try_from(array.len())?,
239            value_count: 0,
240            average_run_length: 0,
241            distinct_values_count: 0,
242            typed: TypedStats {
243                min: T::max_value(),
244                max: T::min_value(),
245                top_value: T::default(),
246                top_count: 0,
247                distinct_values: HashMap::with_hasher(FxBuildHasher),
248            }
249            .into(),
250        });
251    }
252
253    let validity = array.validity_mask()?;
254    let null_count = validity.false_count();
255    let value_count = validity.true_count();
256
257    // Initialize loop state
258    let head_idx = validity
259        .first()
260        .vortex_expect("All null masks have been handled before");
261    let buffer = array.to_buffer::<T>();
262    let head = buffer[head_idx];
263
264    let mut loop_state = LoopState {
265        distinct_values: if count_distinct_values {
266            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
267        } else {
268            HashMap::with_hasher(FxBuildHasher)
269        },
270        prev: head,
271        runs: 1,
272    };
273
274    let sliced = buffer.slice(head_idx..array.len());
275    let mut chunks = sliced.as_slice().chunks_exact(64);
276    match validity.bit_buffer() {
277        AllOr::All => {
278            for chunk in &mut chunks {
279                inner_loop_nonnull(
280                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
281                    count_distinct_values,
282                    &mut loop_state,
283                )
284            }
285            let remainder = chunks.remainder();
286            inner_loop_naive(
287                remainder,
288                count_distinct_values,
289                &BitBuffer::new_set(remainder.len()),
290                &mut loop_state,
291            );
292        }
293        AllOr::None => unreachable!("All invalid arrays have been handled before"),
294        AllOr::Some(v) => {
295            let mask = v.slice(head_idx..array.len());
296            let mut offset = 0;
297            for chunk in &mut chunks {
298                let validity = mask.slice(offset..(offset + 64));
299                offset += 64;
300
301                match validity.true_count() {
302                    // All nulls -> no stats to update
303                    0 => continue,
304                    // Inner loop for when validity check can be elided
305                    64 => inner_loop_nonnull(
306                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
307                        count_distinct_values,
308                        &mut loop_state,
309                    ),
310                    // Inner loop for when we need to check validity
311                    _ => inner_loop_nullable(
312                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
313                        count_distinct_values,
314                        &validity,
315                        &mut loop_state,
316                    ),
317                }
318            }
319            // Final iteration, run naive loop
320            let remainder = chunks.remainder();
321            inner_loop_naive(
322                remainder,
323                count_distinct_values,
324                &mask.slice(offset..(offset + remainder.len())),
325                &mut loop_state,
326            );
327        }
328    }
329
330    let (top_value, top_count) = if count_distinct_values {
331        let (&top_value, &top_count) = loop_state
332            .distinct_values
333            .iter()
334            .max_by_key(|&(_, &count)| count)
335            .vortex_expect("non-empty");
336        (top_value.0, top_count)
337    } else {
338        (T::default(), 0)
339    };
340
341    let runs = loop_state.runs;
342    let distinct_values_count = if count_distinct_values {
343        u32::try_from(loop_state.distinct_values.len())?
344    } else {
345        u32::MAX
346    };
347
348    let min = array
349        .statistics()
350        .compute_as::<T>(Stat::Min)
351        .vortex_expect("min should be computed");
352
353    let max = array
354        .statistics()
355        .compute_as::<T>(Stat::Max)
356        .vortex_expect("max should be computed");
357
358    let typed = TypedStats {
359        min,
360        max,
361        distinct_values: loop_state.distinct_values,
362        top_value,
363        top_count,
364    };
365
366    let null_count = u32::try_from(null_count)?;
367    let value_count = u32::try_from(value_count)?;
368
369    Ok(IntegerStats {
370        src: array.clone(),
371        null_count,
372        value_count,
373        average_run_length: value_count / runs,
374        distinct_values_count,
375        typed: typed.into(),
376    })
377}
378
379struct LoopState<T> {
380    prev: T,
381    runs: u32,
382    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
383}
384
385#[inline(always)]
386fn inner_loop_nonnull<T: IntegerPType>(
387    values: &[T; 64],
388    count_distinct_values: bool,
389    state: &mut LoopState<T>,
390) where
391    NativeValue<T>: Eq + Hash,
392{
393    for &value in values {
394        if count_distinct_values {
395            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
396        }
397
398        if value != state.prev {
399            state.prev = value;
400            state.runs += 1;
401        }
402    }
403}
404
405#[inline(always)]
406fn inner_loop_nullable<T: IntegerPType>(
407    values: &[T; 64],
408    count_distinct_values: bool,
409    is_valid: &BitBuffer,
410    state: &mut LoopState<T>,
411) where
412    NativeValue<T>: Eq + Hash,
413{
414    for (idx, &value) in values.iter().enumerate() {
415        if is_valid.value(idx) {
416            if count_distinct_values {
417                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
418            }
419
420            if value != state.prev {
421                state.prev = value;
422                state.runs += 1;
423            }
424        }
425    }
426}
427
428#[inline(always)]
429fn inner_loop_naive<T: IntegerPType>(
430    values: &[T],
431    count_distinct_values: bool,
432    is_valid: &BitBuffer,
433    state: &mut LoopState<T>,
434) where
435    NativeValue<T>: Eq + Hash,
436{
437    for (idx, &value) in values.iter().enumerate() {
438        if is_valid.value(idx) {
439            if count_distinct_values {
440                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
441            }
442
443            if value != state.prev {
444                state.prev = value;
445                state.runs += 1;
446            }
447        }
448    }
449}
450
451#[cfg(test)]
452mod tests {
453    use std::iter;
454
455    use vortex_array::arrays::PrimitiveArray;
456    use vortex_array::validity::Validity;
457    use vortex_buffer::BitBuffer;
458    use vortex_buffer::Buffer;
459    use vortex_buffer::buffer;
460    use vortex_error::VortexResult;
461
462    use super::IntegerStats;
463    use super::typed_int_stats;
464    use crate::CompressorStats;
465
466    #[test]
467    fn test_naive_count_distinct_values() -> VortexResult<()> {
468        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
469        let stats = typed_int_stats::<u8>(&array, true)?;
470        assert_eq!(stats.distinct_values_count, 2);
471        Ok(())
472    }
473
474    #[test]
475    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
476        let array = PrimitiveArray::new(
477            buffer![217u8, 0],
478            Validity::from(BitBuffer::from(vec![true, false])),
479        );
480        let stats = typed_int_stats::<u8>(&array, true)?;
481        assert_eq!(stats.distinct_values_count, 1);
482        Ok(())
483    }
484
485    #[test]
486    fn test_count_distinct_values() -> VortexResult<()> {
487        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
488        let stats = typed_int_stats::<u8>(&array, true)?;
489        assert_eq!(stats.distinct_values_count, 128);
490        Ok(())
491    }
492
493    #[test]
494    fn test_count_distinct_values_nullable() -> VortexResult<()> {
495        let array = PrimitiveArray::new(
496            (0..128u8).collect::<Buffer<u8>>(),
497            Validity::from(BitBuffer::from_iter(
498                iter::repeat_n(vec![true, false], 64).flatten(),
499            )),
500        );
501        let stats = typed_int_stats::<u8>(&array, true)?;
502        assert_eq!(stats.distinct_values_count, 64);
503        Ok(())
504    }
505
506    #[test]
507    fn test_integer_stats_leading_nulls() {
508        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
509
510        let stats = IntegerStats::generate(&ints);
511
512        assert_eq!(stats.value_count, 2);
513        assert_eq!(stats.null_count, 1);
514        assert_eq!(stats.average_run_length, 1);
515        assert_eq!(stats.distinct_values_count, 2);
516    }
517}
vortex_btrblocks/compressor/integer/stats.rs

vortex_btrblocks/compressor/integer/
stats.rs