Skip to main content

vortex_compressor/stats/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Integer compression statistics.
5
6use std::hash::Hash;
7
8use num_traits::PrimInt;
9use rustc_hash::FxBuildHasher;
10use vortex_array::LEGACY_SESSION;
11use vortex_array::VortexSessionExecute;
12use vortex_array::arrays::PrimitiveArray;
13use vortex_array::arrays::primitive::NativeValue;
14use vortex_array::dtype::IntegerPType;
15use vortex_array::expr::stats::Stat;
16use vortex_array::match_each_integer_ptype;
17use vortex_array::scalar::PValue;
18use vortex_array::scalar::Scalar;
19use vortex_buffer::BitBuffer;
20use vortex_error::VortexError;
21use vortex_error::VortexExpect;
22use vortex_error::VortexResult;
23use vortex_mask::AllOr;
24use vortex_utils::aliases::hash_map::HashMap;
25
26use super::GenerateStatsOptions;
27
28/// Information about the distinct values in an integer array.
29#[derive(Debug, Clone)]
30pub struct DistinctInfo<T> {
31    /// The unique values and their occurrences.
32    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
33    /// The count of unique values. This _must_ be non-zero.
34    distinct_count: u32,
35    /// The most frequent value.
36    most_frequent_value: T,
37    /// The number of times the most frequent value occurs.
38    top_frequency: u32,
39}
40
41impl<T> DistinctInfo<T> {
42    /// Returns a reference to the distinct values map.
43    pub fn distinct_values(&self) -> &HashMap<NativeValue<T>, u32, FxBuildHasher> {
44        &self.distinct_values
45    }
46}
47
48/// Typed statistics for a specific integer type.
49#[derive(Debug, Clone)]
50pub struct TypedStats<T> {
51    /// The minimum value.
52    min: T,
53    /// The maximum value.
54    max: T,
55    /// Distinct value information, or `None` if not computed.
56    distinct: Option<DistinctInfo<T>>,
57}
58
59impl<T> TypedStats<T> {
60    /// Returns the distinct value information, if computed.
61    pub fn distinct(&self) -> Option<&DistinctInfo<T>> {
62        self.distinct.as_ref()
63    }
64}
65
66impl<T> TypedStats<T> {
67    /// Get the count of distinct values, if we have computed it already.
68    fn distinct_count(&self) -> Option<u32> {
69        Some(self.distinct.as_ref()?.distinct_count)
70    }
71
72    /// Get the most commonly occurring value and its count, if we have computed it already.
73    fn most_frequent_value_and_count(&self) -> Option<(&T, u32)> {
74        let distinct = self.distinct.as_ref()?;
75        Some((&distinct.most_frequent_value, distinct.top_frequency))
76    }
77}
78
79/// Type-erased container for one of the [`TypedStats`] variants.
80///
81/// Building the `TypedStats` is considerably faster and cheaper than building a type-erased
82/// set of stats. We then perform a variety of access methods on them.
83#[derive(Clone, Debug)]
84pub enum ErasedStats {
85    /// Stats for `u8` arrays.
86    U8(TypedStats<u8>),
87    /// Stats for `u16` arrays.
88    U16(TypedStats<u16>),
89    /// Stats for `u32` arrays.
90    U32(TypedStats<u32>),
91    /// Stats for `u64` arrays.
92    U64(TypedStats<u64>),
93    /// Stats for `i8` arrays.
94    I8(TypedStats<i8>),
95    /// Stats for `i16` arrays.
96    I16(TypedStats<i16>),
97    /// Stats for `i32` arrays.
98    I32(TypedStats<i32>),
99    /// Stats for `i64` arrays.
100    I64(TypedStats<i64>),
101}
102
103impl ErasedStats {
104    /// Returns `true` if the minimum value is zero.
105    pub fn min_is_zero(&self) -> bool {
106        match &self {
107            ErasedStats::U8(x) => x.min == 0,
108            ErasedStats::U16(x) => x.min == 0,
109            ErasedStats::U32(x) => x.min == 0,
110            ErasedStats::U64(x) => x.min == 0,
111            ErasedStats::I8(x) => x.min == 0,
112            ErasedStats::I16(x) => x.min == 0,
113            ErasedStats::I32(x) => x.min == 0,
114            ErasedStats::I64(x) => x.min == 0,
115        }
116    }
117
118    /// Returns `true` if the minimum value is negative.
119    pub fn min_is_negative(&self) -> bool {
120        match &self {
121            ErasedStats::U8(_)
122            | ErasedStats::U16(_)
123            | ErasedStats::U32(_)
124            | ErasedStats::U64(_) => false,
125            ErasedStats::I8(x) => x.min < 0,
126            ErasedStats::I16(x) => x.min < 0,
127            ErasedStats::I32(x) => x.min < 0,
128            ErasedStats::I64(x) => x.min < 0,
129        }
130    }
131
132    /// Difference between max and min.
133    pub fn max_minus_min(&self) -> u64 {
134        match &self {
135            ErasedStats::U8(x) => (x.max - x.min) as u64,
136            ErasedStats::U16(x) => (x.max - x.min) as u64,
137            ErasedStats::U32(x) => (x.max - x.min) as u64,
138            ErasedStats::U64(x) => x.max - x.min,
139            ErasedStats::I8(x) => (x.max as i16 - x.min as i16) as u64,
140            ErasedStats::I16(x) => (x.max as i32 - x.min as i32) as u64,
141            ErasedStats::I32(x) => (x.max as i64 - x.min as i64) as u64,
142            ErasedStats::I64(x) => u64::try_from(x.max as i128 - x.min as i128)
143                .vortex_expect("max minus min result bigger than u64"),
144        }
145    }
146
147    /// Returns the ilog2 of the max value when transmuted to unsigned, or `None` if zero.
148    ///
149    /// This matches how BitPacking computes bit width: it reinterprets signed values as
150    /// unsigned (preserving bit pattern) and uses `leading_zeros`. For non-negative signed
151    /// values, the transmuted value equals the original value.
152    ///
153    /// This is used to determine if FOR encoding would reduce bit width compared to
154    /// direct BitPacking. If `max_ilog2() == max_minus_min_ilog2()`, FOR doesn't help.
155    pub fn max_ilog2(&self) -> Option<u32> {
156        match &self {
157            ErasedStats::U8(x) => x.max.checked_ilog2(),
158            ErasedStats::U16(x) => x.max.checked_ilog2(),
159            ErasedStats::U32(x) => x.max.checked_ilog2(),
160            ErasedStats::U64(x) => x.max.checked_ilog2(),
161            // Transmute signed to unsigned (bit pattern preserved) to match BitPacking behavior.
162            ErasedStats::I8(x) => (x.max as u8).checked_ilog2(),
163            ErasedStats::I16(x) => (x.max as u16).checked_ilog2(),
164            ErasedStats::I32(x) => (x.max as u32).checked_ilog2(),
165            ErasedStats::I64(x) => (x.max as u64).checked_ilog2(),
166        }
167    }
168
169    /// Get the count of distinct values, if we have computed it already.
170    pub fn distinct_count(&self) -> Option<u32> {
171        match &self {
172            ErasedStats::U8(x) => x.distinct_count(),
173            ErasedStats::U16(x) => x.distinct_count(),
174            ErasedStats::U32(x) => x.distinct_count(),
175            ErasedStats::U64(x) => x.distinct_count(),
176            ErasedStats::I8(x) => x.distinct_count(),
177            ErasedStats::I16(x) => x.distinct_count(),
178            ErasedStats::I32(x) => x.distinct_count(),
179            ErasedStats::I64(x) => x.distinct_count(),
180        }
181    }
182
183    /// Get the most commonly occurring value and its count.
184    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
185        match &self {
186            ErasedStats::U8(x) => {
187                let (top_value, count) = x.most_frequent_value_and_count()?;
188                Some(((*top_value).into(), count))
189            }
190            ErasedStats::U16(x) => {
191                let (top_value, count) = x.most_frequent_value_and_count()?;
192                Some(((*top_value).into(), count))
193            }
194            ErasedStats::U32(x) => {
195                let (top_value, count) = x.most_frequent_value_and_count()?;
196                Some(((*top_value).into(), count))
197            }
198            ErasedStats::U64(x) => {
199                let (top_value, count) = x.most_frequent_value_and_count()?;
200                Some(((*top_value).into(), count))
201            }
202            ErasedStats::I8(x) => {
203                let (top_value, count) = x.most_frequent_value_and_count()?;
204                Some(((*top_value).into(), count))
205            }
206            ErasedStats::I16(x) => {
207                let (top_value, count) = x.most_frequent_value_and_count()?;
208                Some(((*top_value).into(), count))
209            }
210            ErasedStats::I32(x) => {
211                let (top_value, count) = x.most_frequent_value_and_count()?;
212                Some(((*top_value).into(), count))
213            }
214            ErasedStats::I64(x) => {
215                let (top_value, count) = x.most_frequent_value_and_count()?;
216                Some(((*top_value).into(), count))
217            }
218        }
219    }
220}
221
222/// Implements `From<TypedStats<$T>>` for [`ErasedStats`].
223macro_rules! impl_from_typed {
224    ($T:ty, $variant:path) => {
225        impl From<TypedStats<$T>> for ErasedStats {
226            fn from(typed: TypedStats<$T>) -> Self {
227                $variant(typed)
228            }
229        }
230    };
231}
232
233impl_from_typed!(u8, ErasedStats::U8);
234impl_from_typed!(u16, ErasedStats::U16);
235impl_from_typed!(u32, ErasedStats::U32);
236impl_from_typed!(u64, ErasedStats::U64);
237impl_from_typed!(i8, ErasedStats::I8);
238impl_from_typed!(i16, ErasedStats::I16);
239impl_from_typed!(i32, ErasedStats::I32);
240impl_from_typed!(i64, ErasedStats::I64);
241
242/// Array of integers and relevant stats for compression.
243#[derive(Clone, Debug)]
244pub struct IntegerStats {
245    /// Cache for `validity.false_count()`.
246    null_count: u32,
247    /// Cache for `validity.true_count()`.
248    value_count: u32,
249    /// The average run length.
250    average_run_length: u32,
251    /// Type-erased typed statistics.
252    erased: ErasedStats,
253}
254
255impl IntegerStats {
256    /// Generates stats, returning an error on failure.
257    fn generate_opts_fallible(
258        input: &PrimitiveArray,
259        opts: GenerateStatsOptions,
260    ) -> VortexResult<Self> {
261        match_each_integer_ptype!(input.ptype(), |T| {
262            typed_int_stats::<T>(input, opts.count_distinct_values)
263        })
264    }
265
266    /// Get the count of distinct values, if we have computed it already.
267    pub fn distinct_count(&self) -> Option<u32> {
268        self.erased.distinct_count()
269    }
270
271    /// Get the most commonly occurring value and its count, if we have computed it already.
272    pub fn most_frequent_value_and_count(&self) -> Option<(PValue, u32)> {
273        self.erased.most_frequent_value_and_count()
274    }
275}
276
277impl IntegerStats {
278    /// Generates stats with default options.
279    pub fn generate(input: &PrimitiveArray) -> Self {
280        Self::generate_opts(input, GenerateStatsOptions::default())
281    }
282
283    /// Generates stats with provided options.
284    pub fn generate_opts(input: &PrimitiveArray, opts: GenerateStatsOptions) -> Self {
285        Self::generate_opts_fallible(input, opts)
286            .vortex_expect("IntegerStats::generate_opts should not fail")
287    }
288
289    /// Returns the number of null values.
290    pub fn null_count(&self) -> u32 {
291        self.null_count
292    }
293
294    /// Returns the number of non-null values.
295    pub fn value_count(&self) -> u32 {
296        self.value_count
297    }
298
299    /// Returns the average run length.
300    pub fn average_run_length(&self) -> u32 {
301        self.average_run_length
302    }
303
304    /// Returns the type-erased typed statistics.
305    pub fn erased(&self) -> &ErasedStats {
306        &self.erased
307    }
308}
309
310/// Computes typed integer statistics for a specific integer type.
311fn typed_int_stats<T>(
312    array: &PrimitiveArray,
313    count_distinct_values: bool,
314) -> VortexResult<IntegerStats>
315where
316    T: IntegerPType + PrimInt + for<'a> TryFrom<&'a Scalar, Error = VortexError>,
317    TypedStats<T>: Into<ErasedStats>,
318    NativeValue<T>: Eq + Hash,
319{
320    // Special case: empty array.
321    if array.is_empty() {
322        return Ok(IntegerStats {
323            null_count: 0,
324            value_count: 0,
325            average_run_length: 0,
326            erased: TypedStats {
327                min: T::max_value(),
328                max: T::min_value(),
329                distinct: None,
330            }
331            .into(),
332        });
333    }
334
335    let mut ctx = LEGACY_SESSION.create_execution_ctx();
336    if array.all_invalid(&mut ctx)? {
337        return Ok(IntegerStats {
338            null_count: u32::try_from(array.len())?,
339            value_count: 0,
340            average_run_length: 0,
341            erased: TypedStats {
342                min: T::max_value(),
343                max: T::min_value(),
344                distinct: Some(DistinctInfo {
345                    distinct_values: HashMap::with_capacity_and_hasher(0, FxBuildHasher),
346                    distinct_count: 0,
347                    most_frequent_value: T::zero(),
348                    top_frequency: 0,
349                }),
350            }
351            .into(),
352        });
353    }
354
355    let validity = array.as_ref().validity()?.to_mask(
356        array.as_ref().len(),
357        &mut LEGACY_SESSION.create_execution_ctx(),
358    )?;
359    let null_count = validity.false_count();
360    let value_count = validity.true_count();
361
362    // Initialize loop state.
363    let head_idx = validity
364        .first()
365        .vortex_expect("All null masks have been handled before");
366    let buffer = array.to_buffer::<T>();
367    let head = buffer[head_idx];
368
369    let mut loop_state = LoopState {
370        distinct_values: if count_distinct_values {
371            HashMap::with_capacity_and_hasher(array.len() / 2, FxBuildHasher)
372        } else {
373            HashMap::with_hasher(FxBuildHasher)
374        },
375        prev: head,
376        runs: 1,
377    };
378
379    let sliced = buffer.slice(head_idx..array.len());
380    let mut chunks = sliced.as_slice().chunks_exact(64);
381    match validity.bit_buffer() {
382        AllOr::All => {
383            for chunk in &mut chunks {
384                inner_loop_nonnull(
385                    chunk.try_into().ok().vortex_expect("chunk size must be 64"),
386                    count_distinct_values,
387                    &mut loop_state,
388                )
389            }
390            let remainder = chunks.remainder();
391            inner_loop_naive(
392                remainder,
393                count_distinct_values,
394                &BitBuffer::new_set(remainder.len()),
395                &mut loop_state,
396            );
397        }
398        AllOr::None => unreachable!("All invalid arrays have been handled before"),
399        AllOr::Some(v) => {
400            let mask = v.slice(head_idx..array.len());
401            let mut offset = 0;
402            for chunk in &mut chunks {
403                let validity = mask.slice(offset..(offset + 64));
404                offset += 64;
405
406                match validity.true_count() {
407                    // All nulls -> no stats to update.
408                    0 => continue,
409                    // Inner loop for when validity check can be elided.
410                    64 => inner_loop_nonnull(
411                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
412                        count_distinct_values,
413                        &mut loop_state,
414                    ),
415                    // Inner loop for when we need to check validity.
416                    _ => inner_loop_nullable(
417                        chunk.try_into().ok().vortex_expect("chunk size must be 64"),
418                        count_distinct_values,
419                        &validity,
420                        &mut loop_state,
421                    ),
422                }
423            }
424            // Final iteration, run naive loop.
425            let remainder = chunks.remainder();
426            inner_loop_naive(
427                remainder,
428                count_distinct_values,
429                &mask.slice(offset..(offset + remainder.len())),
430                &mut loop_state,
431            );
432        }
433    }
434
435    let runs = loop_state.runs;
436
437    let array_ref = array.as_ref();
438    let min = array_ref
439        .statistics()
440        .compute_as::<T>(Stat::Min, &mut ctx)
441        .vortex_expect("min should be computed");
442
443    let max = array_ref
444        .statistics()
445        .compute_as::<T>(Stat::Max, &mut ctx)
446        .vortex_expect("max should be computed");
447
448    let distinct = count_distinct_values.then(|| {
449        let (&top_value, &top_count) = loop_state
450            .distinct_values
451            .iter()
452            .max_by_key(|&(_, &count)| count)
453            .vortex_expect("we know this is non-empty");
454
455        DistinctInfo {
456            distinct_count: u32::try_from(loop_state.distinct_values.len())
457                .vortex_expect("there are more than `u32::MAX` distinct values"),
458            most_frequent_value: top_value.0,
459            top_frequency: top_count,
460            distinct_values: loop_state.distinct_values,
461        }
462    });
463
464    let typed = TypedStats { min, max, distinct };
465
466    let null_count = u32::try_from(null_count)?;
467    let value_count = u32::try_from(value_count)?;
468
469    Ok(IntegerStats {
470        null_count,
471        value_count,
472        average_run_length: value_count / runs,
473        erased: typed.into(),
474    })
475}
476
477/// Internal loop state for integer stats computation.
478struct LoopState<T> {
479    /// The previous value seen.
480    prev: T,
481    /// The run count.
482    runs: u32,
483    /// The distinct values map.
484    distinct_values: HashMap<NativeValue<T>, u32, FxBuildHasher>,
485}
486
487/// Inner loop for non-null chunks of 64 values.
488#[inline(always)]
489fn inner_loop_nonnull<T: IntegerPType>(
490    values: &[T; 64],
491    count_distinct_values: bool,
492    state: &mut LoopState<T>,
493) where
494    NativeValue<T>: Eq + Hash,
495{
496    for &value in values {
497        if count_distinct_values {
498            *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
499        }
500
501        if value != state.prev {
502            state.prev = value;
503            state.runs += 1;
504        }
505    }
506}
507
508/// Inner loop for nullable chunks of 64 values.
509#[inline(always)]
510fn inner_loop_nullable<T: IntegerPType>(
511    values: &[T; 64],
512    count_distinct_values: bool,
513    is_valid: &BitBuffer,
514    state: &mut LoopState<T>,
515) where
516    NativeValue<T>: Eq + Hash,
517{
518    for (idx, &value) in values.iter().enumerate() {
519        if is_valid.value(idx) {
520            if count_distinct_values {
521                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
522            }
523
524            if value != state.prev {
525                state.prev = value;
526                state.runs += 1;
527            }
528        }
529    }
530}
531
532/// Fallback inner loop for remainder values.
533#[inline(always)]
534fn inner_loop_naive<T: IntegerPType>(
535    values: &[T],
536    count_distinct_values: bool,
537    is_valid: &BitBuffer,
538    state: &mut LoopState<T>,
539) where
540    NativeValue<T>: Eq + Hash,
541{
542    for (idx, &value) in values.iter().enumerate() {
543        if is_valid.value(idx) {
544            if count_distinct_values {
545                *state.distinct_values.entry(NativeValue(value)).or_insert(0) += 1;
546            }
547
548            if value != state.prev {
549                state.prev = value;
550                state.runs += 1;
551            }
552        }
553    }
554}
555
556#[cfg(test)]
557mod tests {
558    use std::iter;
559
560    use vortex_array::arrays::PrimitiveArray;
561    use vortex_array::validity::Validity;
562    use vortex_buffer::BitBuffer;
563    use vortex_buffer::Buffer;
564    use vortex_buffer::buffer;
565    use vortex_error::VortexResult;
566
567    use super::IntegerStats;
568    use super::typed_int_stats;
569
570    #[test]
571    fn test_naive_count_distinct_values() -> VortexResult<()> {
572        let array = PrimitiveArray::new(buffer![217u8, 0], Validity::NonNullable);
573        let stats = typed_int_stats::<u8>(&array, true)?;
574        assert_eq!(stats.distinct_count().unwrap(), 2);
575        Ok(())
576    }
577
578    #[test]
579    fn test_naive_count_distinct_values_nullable() -> VortexResult<()> {
580        let array = PrimitiveArray::new(
581            buffer![217u8, 0],
582            Validity::from(BitBuffer::from(vec![true, false])),
583        );
584        let stats = typed_int_stats::<u8>(&array, true)?;
585        assert_eq!(stats.distinct_count().unwrap(), 1);
586        Ok(())
587    }
588
589    #[test]
590    fn test_count_distinct_values() -> VortexResult<()> {
591        let array = PrimitiveArray::new((0..128u8).collect::<Buffer<u8>>(), Validity::NonNullable);
592        let stats = typed_int_stats::<u8>(&array, true)?;
593        assert_eq!(stats.distinct_count().unwrap(), 128);
594        Ok(())
595    }
596
597    #[test]
598    fn test_count_distinct_values_nullable() -> VortexResult<()> {
599        let array = PrimitiveArray::new(
600            (0..128u8).collect::<Buffer<u8>>(),
601            Validity::from(BitBuffer::from_iter(
602                iter::repeat_n(vec![true, false], 64).flatten(),
603            )),
604        );
605        let stats = typed_int_stats::<u8>(&array, true)?;
606        assert_eq!(stats.distinct_count().unwrap(), 64);
607        Ok(())
608    }
609
610    #[test]
611    fn test_integer_stats_leading_nulls() {
612        let ints = PrimitiveArray::new(buffer![0, 1, 2], Validity::from_iter([false, true, true]));
613
614        let stats = IntegerStats::generate_opts(
615            &ints,
616            crate::stats::GenerateStatsOptions {
617                count_distinct_values: true,
618            },
619        );
620
621        assert_eq!(stats.value_count, 2);
622        assert_eq!(stats.null_count, 1);
623        assert_eq!(stats.average_run_length, 1);
624        assert_eq!(stats.distinct_count().unwrap(), 2);
625    }
626}