Skip to main content

vortex_btrblocks/compressor/integer/
stats.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::hash::Hash;
5
6use num_traits::PrimInt;
7use rustc_hash::FxBuildHasher;
8use vortex_array::IntoArray;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::Primitive;
11use vortex_array::arrays::PrimitiveArray;
12use vortex_array::arrays::primitive::NativeValue;
13use vortex_array::dtype::IntegerPType;
14use vortex_array::expr::stats::Stat;
15use vortex_array::match_each_integer_ptype;
16use vortex_array::scalar::PValue;
17use vortex_array::scalar::Scalar;
18use vortex_buffer::BitBuffer;
19use vortex_error::VortexError;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22use vortex_mask::AllOr;
23use vortex_utils::aliases::hash_map::HashMap;
24
25use crate::CompressorStats;
26use crate::GenerateStatsOptions;
27use crate::compressor::rle::RLEStats;
28use crate::sample::sample;
29
30#[derive(Clone, Debug)]
31pub struct TypedStats<T> {
32    pub min: T,
33    pub max: T,
34    pub top_value: T,
35    pub top_count: u32,
36    pub distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
37}
38
39/// Type-erased container for one of the [TypedStats] variants.
40///
41/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
42/// set of stats. We then perform a variety of access methods on them.
43#[derive(Clone, Debug)]
44pub enum ErasedStats {
45    U8(TypedStats<u8>),
46    U16(TypedStats<u16>),
47    U32(TypedStats<u32>),
48    U64(TypedStats<u64>),
49    I8(TypedStats<i8>),
50    I16(TypedStats<i16>),
51    I32(TypedStats<i32>),
52    I64(TypedStats<i64>),
53}
54
55impl ErasedStats {
56    pub fn min_is_zero(&self) -> bool {
57        match &self {
58            ErasedStats::U8(x) => x.min == 0,
59            ErasedStats::U16(x) => x.min == 0,
60            ErasedStats::U32(x) => x.min == 0,
61            ErasedStats::U64(x) => x.min == 0,
62            ErasedStats::I8(x) => x.min == 0,
63            ErasedStats::I16(x) => x.min == 0,
64            ErasedStats::I32(x) => x.min == 0,
65            ErasedStats::I64(x) => x.min == 0,
66        }
67    }
68
69    pub fn min_is_negative(&self) -> bool {
70        match &self {
71            ErasedStats::U8(_)
72            | ErasedStats::U16(_)
73            | ErasedStats::U32(_)
74            | ErasedStats::U64(_) => false,
75            ErasedStats::I8(x) => x.min < 0,
76            ErasedStats::I16(x) => x.min < 0,
77            ErasedStats::I32(x) => x.min < 0,
78            ErasedStats::I64(x) => x.min < 0,
79        }
80    }
81
82    // Difference between max and min.
83    pub fn max_minus_min(&self) -> u64 {
84        match &self {
85            ErasedStats::U8(x) => (x.max - x.min) as u64,
86            ErasedStats::U16(x) => (x.max - x.min) as u64,
87            ErasedStats::U32(x) => (x.max - x.min) as u64,
88            ErasedStats::U64(x) => x.max - x.min,
89            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
90            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
91            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
92            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
93                .vortex_expect("max minus min result bigger than u64"),
94        }
95    }
96
97    /// Returns the ilog2 of the max value when transmuted to unsigned, or None if zero.
98    ///
99    /// This matches how BitPacking computes bit width: it reinterprets signed values as
100    /// unsigned (preserving bit pattern) and uses leading_zeros. For non-negative signed
101    /// values, the transmuted value equals the original value.
102    ///
103    /// This is used to determine if FOR encoding would reduce bit width compared to
104    /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
105    pub fn max_ilog2(&self) -> Option<u32> {
106        match &self {
107            ErasedStats::U8(x) => x.max.checked_ilog2(),
108            ErasedStats::U16(x) => x.max.checked_ilog2(),
109            ErasedStats::U32(x) => x.max.checked_ilog2(),
110            ErasedStats::U64(x) => x.max.checked_ilog2(),
111            // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior
112            ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
113            ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
114            ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
115            ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
116        }
117    }
118
119    /// Get the most commonly occurring value and its count
120    pub fn top_value_and_count(&self) -> (PValue, u32) {
121        match &self {
122            ErasedStats::U8(x) => (x.top_value.into(), x.top_count),
123            ErasedStats::U16(x) => (x.top_value.into(), x.top_count),
124            ErasedStats::U32(x) => (x.top_value.into(), x.top_count),
125            ErasedStats::U64(x) => (x.top_value.into(), x.top_count),
126            ErasedStats::I8(x) => (x.top_value.into(), x.top_count),
127            ErasedStats::I16(x) => (x.top_value.into(), x.top_count),
128            ErasedStats::I32(x) => (x.top_value.into(), x.top_count),
129            ErasedStats::I64(x) => (x.top_value.into(), x.top_count),
130        }
131    }
132}
133
134macro_rules! impl_from_typed {
135    ($T:ty, $variant:path) => {
136        impl From<TypedStats<$T>> for ErasedStats {
137            fn from(typed: TypedStats<$T>) -> Self {
138                $variant(typed)
139            }
140        }
141    };
142}
143
144impl_from_typed!(u8, ErasedStats::U8);
145impl_from_typed!(u16, ErasedStats::U16);
146impl_from_typed!(u32, ErasedStats::U32);
147impl_from_typed!(u64, ErasedStats::U64);
148impl_from_typed!(i8, ErasedStats::I8);
149impl_from_typed!(i16, ErasedStats::I16);
150impl_from_typed!(i32, ErasedStats::I32);
151impl_from_typed!(i64, ErasedStats::I64);
152
153/// Array of integers and relevant stats for compression.
154#[derive(Clone, Debug)]
155pub struct IntegerStats {
156    pub(super) src: PrimitiveArray,
157    // cache for validity.false_count()
158    pub(super) null_count: u32,
159    // cache for validity.true_count()
160    pub(super) value_count: u32,
161    pub(super) average_run_length: u32,
162    pub(super) distinct_values_count: u32,
163    pub(crate) typed: ErasedStats,
164}
165
166impl IntegerStats {
167    fn generate_opts_fallible(
168        input: &PrimitiveArray,
169        opts: GenerateStatsOptions,
170    ) -> VortexResult<Self> {
171        match_each_integer_ptype!(input.ptype(), |T| {
172            typed_int_stats::<T>(input, opts.count_distinct_values)
173        })
174    }
175}
176
177impl CompressorStats for IntegerStats {
178    type ArrayVTable = Primitive;
179
180    fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
181        Self::generate_opts_fallible(input, opts)
182            .vortex_expect("IntegerStats::generate_opts should not fail")
183    }
184
185    fn source(&self) -> &PrimitiveArray {
186        &self.src
187    }
188
189    fn sample_opts(&self, sample_size: u32, sample_count: u32, opts: GenerateStatsOptions) -> Self {
190        let sampled =
191            sample(&self.src.clone().into_array(), sample_size, sample_count).to_primitive();
192
193        Self::generate_opts(&sampled, opts)
194    }
195}
196
197impl RLEStats for IntegerStats {
198    fn value_count(&self) -> u32 {
199        self.value_count
200    }
201
202    fn average_run_length(&self) -> u32 {
203        self.average_run_length
204    }
205
206    fn source(&self) -> &PrimitiveArray {
207        &self.src
208    }
209}
210
211fn typed_int_stats<T>(
212    array: &PrimitiveArray,
213    count_distinct_values: bool,
214) -> VortexResult<IntegerStats>
215where
216    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
217    TypedStats<T>: Into<ErasedStats>,
218    NativeValue<T>: Eq + Hash,
219{
220    // Special case: empty array
221    if array.is_empty() {
222        return Ok(IntegerStats {
223            src: array.clone(),
224            null_count: 0,
225            value_count: 0,
226            average_run_length: 0,
227            distinct_values_count: 0,
228            typed: TypedStats {
229                min: T::max_value(),
230                max: T::min_value(),
231                top_value: T::default(),
232                top_count: 0,
233                distinct_values: HashMap::with_hasher(FxBuildHasher),
234            }
235            .into(),
236        });
237    } else if array.all_invalid()? {
238        return Ok(IntegerStats {
239            src: array.clone(),
240            null_count: u32::try_from(array.len())?,
241            value_count: 0,
242            average_run_length: 0,
243            distinct_values_count: 0,
244            typed: TypedStats {
245                min: T::max_value(),
246                max: T::min_value(),
247                top_value: T::default(),
248                top_count: 0,
249                distinct_values: HashMap::with_hasher(FxBuildHasher),
250            }
251            .into(),
252        });
253    }
254
255    let validity = array.validity_mask()?;
256    let null_count = validity.false_count();
257    let value_count = validity.true_count();
258
259    // Initialize loop state
260    let head_idx = validity
261        .first()
262        .vortex_expect("All null masks have been handled before");
263    let buffer = array.to_buffer::<T>();
264    let head = buffer[head_idx];
265
266    let mut loop_state = LoopState {
267        distinct_values: if count_distinct_values {
268            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
269        } else {
270            HashMap::with_hasher(FxBuildHasher)
271        },
272        prev: head,
273        runs: 1,
274    };
275
276    let sliced = buffer.slice(head_idx..array.len());
277    let mut chunks = sliced.as_slice().chunks_exact(64);
278    match validity.bit_buffer() {
279        AllOr::All => {
280            for chunk in &mut chunks {
281                inner_loop_nonnull(
282                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
283                    count_distinct_values,
284                    &mut loop_state,
285                )
286            }
287            let remainder = chunks.remainder();
288            inner_loop_naive(
289                remainder,
290                count_distinct_values,
291                &BitBuffer::new_set(remainder.len()),
292                &mut loop_state,
293            );
294        }
295        AllOr::None => unreachable!("All invalid arrays have been handled before"),
296        AllOr::Some(v) => {
297            let mask = v.slice(head_idx..array.len());
298            let mut offset = 0;
299            for chunk in &mut chunks {
300                let validity = mask.slice(offset..(offset + 64));
301                offset += 64;
302
303                match validity.true_count() {
304                    // All nulls -> no stats to update
305                    0 => continue,
306                    // Inner loop for when validity check can be elided
307                    64 => inner_loop_nonnull(
308                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
309                        count_distinct_values,
310                        &mut loop_state,
311                    ),
312                    // Inner loop for when we need to check validity
313                    _ => inner_loop_nullable(
314                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
315                        count_distinct_values,
316                        &validity,
317                        &mut loop_state,
318                    ),
319                }
320            }
321            // Final iteration, run naive loop
322            let remainder = chunks.remainder();
323            inner_loop_naive(
324                remainder,
325                count_distinct_values,
326                &mask.slice(offset..(offset + remainder.len())),
327                &mut loop_state,
328            );
329        }
330    }
331
332    let (top_value, top_count) = if count_distinct_values {
333        let (&top_value, &top_count) = loop_state
334            .distinct_values
335            .iter()
336            .max_by_key(|&(_, &count)| count)
337            .vortex_expect("non-empty");
338        (top_value.0, top_count)
339    } else {
340        (T::default(), 0)
341    };
342
343    let runs = loop_state.runs;
344    let distinct_values_count = if count_distinct_values {
345        u32::try_from(loop_state.distinct_values.len())?
346    } else {
347        u32::MAX
348    };
349
350    let min = array
351        .statistics()
352        .compute_as::<T>(Stat::Min)
353        .vortex_expect("min should be computed");
354
355    let max = array
356        .statistics()
357        .compute_as::<T>(Stat::Max)
358        .vortex_expect("max should be computed");
359
360    let typed = TypedStats {
361        min,
362        max,
363        distinct_values: loop_state.distinct_values,
364        top_value,
365        top_count,
366    };
367
368    let null_count = u32::try_from(null_count)?;
369    let value_count = u32::try_from(value_count)?;
370
371    Ok(IntegerStats {
372        src: array.clone(),
373        null_count,
374        value_count,
375        average_run_length: value_count / runs,
376        distinct_values_count,
377        typed: typed.into(),
378    })
379}
380
381struct LoopState<T> {
382    prev: T,
383    runs: u32,
384    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
385}
386
387#[inline(always)]
388fn inner_loop_nonnull<T: IntegerPType>(
389    values: &[T; 64],
390    count_distinct_values: bool,
391    state: &mut LoopState<T>,
392) where
393    NativeValue<T>: Eq + Hash,
394{
395    for &value in values {
396        if count_distinct_values {
397            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
398        }
399
400        if value != state.prev {
401            state.prev = value;
402            state.runs += 1;
403        }
404    }
405}
406
407#[inline(always)]
408fn inner_loop_nullable<T: IntegerPType>(
409    values: &[T; 64],
410    count_distinct_values: bool,
411    is_valid: &BitBuffer,
412    state: &mut LoopState<T>,
413) where
414    NativeValue<T>: Eq + Hash,
415{
416    for (idx, &value) in values.iter().enumerate() {
417        if is_valid.value(idx) {
418            if count_distinct_values {
419                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
420            }
421
422            if value != state.prev {
423                state.prev = value;
424                state.runs += 1;
425            }
426        }
427    }
428}
429
430#[inline(always)]
431fn inner_loop_naive<T: IntegerPType>(
432    values: &[T],
433    count_distinct_values: bool,
434    is_valid: &BitBuffer,
435    state: &mut LoopState<T>,
436) where
437    NativeValue<T>: Eq + Hash,
438{
439    for (idx, &value) in values.iter().enumerate() {
440        if is_valid.value(idx) {
441            if count_distinct_values {
442                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
443            }
444
445            if value != state.prev {
446                state.prev = value;
447                state.runs += 1;
448            }
449        }
450    }
451}
452
453#[cfg(test)]
454mod tests {
455    use std::iter;
456
457    use vortex_array::arrays::PrimitiveArray;
458    use vortex_array::validity::Validity;
459    use vortex_buffer::BitBuffer;
460    use vortex_buffer::Buffer;
461    use vortex_buffer::buffer;
462    use vortex_error::VortexResult;
463
464    use super::IntegerStats;
465    use super::typed_int_stats;
466    use crate::CompressorStats;
467
468    #[test]
469    fn test_naive_count_distinct_values() -> VortexResult<()> {
470        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
471        let stats = typed_int_stats::<u8>(&array, true)?;
472        assert_eq!(stats.distinct_values_count, 2);
473        Ok(())
474    }
475
476    #[test]
477    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
478        let array = PrimitiveArray::new(
479            buffer![217u8, 0],
480            Validity::from(BitBuffer::from(vec![true, false])),
481        );
482        let stats = typed_int_stats::<u8>(&array, true)?;
483        assert_eq!(stats.distinct_values_count, 1);
484        Ok(())
485    }
486
487    #[test]
488    fn test_count_distinct_values() -> VortexResult<()> {
489        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
490        let stats = typed_int_stats::<u8>(&array, true)?;
491        assert_eq!(stats.distinct_values_count, 128);
492        Ok(())
493    }
494
495    #[test]
496    fn test_count_distinct_values_nullable() -> VortexResult<()> {
497        let array = PrimitiveArray::new(
498            (0..128u8).collect::<Buffer<u8>>(),
499            Validity::from(BitBuffer::from_iter(
500                iter::repeat_n(vec![true, false], 64).flatten(),
501            )),
502        );
503        let stats = typed_int_stats::<u8>(&array, true)?;
504        assert_eq!(stats.distinct_values_count, 64);
505        Ok(())
506    }
507
508    #[test]
509    fn test_integer_stats_leading_nulls() {
510        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
511
512        let stats = IntegerStats::generate(&ints);
513
514        assert_eq!(stats.value_count, 2);
515        assert_eq!(stats.null_count, 1);
516        assert_eq!(stats.average_run_length, 1);
517        assert_eq!(stats.distinct_values_count, 2);
518    }
519}