vortex_array/stats/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Traits and utilities to compute and access array statistics.
5
6use arrow_buffer::BooleanBufferBuilder;
7use arrow_buffer::MutableBuffer;
8use arrow_buffer::bit_iterator::BitIterator;
9use enum_iterator::last;
10use log::debug;
11pub use stats_set::*;
12
13mod array;
14pub mod flatbuffers;
15mod stats_set;
16
17pub use array::*;
18use vortex_error::VortexExpect;
19
20use crate::expr::stats::Stat;
21
22/// Statistics that are used for pruning files (i.e., we want to ensure they are computed when compressing/writing).
23/// Sum is included for boolean arrays.
24pub const PRUNING_STATS: &[Stat] = &[
25    Stat::Min,
26    Stat::Max,
27    Stat::Sum,
28    Stat::NullCount,
29    Stat::NaNCount,
30];
31
32pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
33    let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize + 1;
34    // TODO(ngates): use vortex-buffer::BitBuffer
35    let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
36        MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
37        max_stat,
38    );
39    for stat in stats {
40        stat_bitset.set_bit(u8::from(*stat) as usize, true);
41    }
42
43    stat_bitset
44        .finish()
45        .into_inner()
46        .into_vec()
47        .unwrap_or_else(|b| b.to_vec())
48}
49
50pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
51    BitIterator::new(bytes, 0, bytes.len() * 8)
52        .enumerate()
53        .filter_map(|(i, b)| b.then_some(i))
54        // Filter out indices failing conversion, these are stats written by newer version of library
55        .filter_map(|i| {
56            let Ok(stat) = u8::try_from(i) else {
57                debug!("invalid stat encountered: {i}");
58                return None;
59            };
60            Stat::try_from(stat).ok()
61        })
62        .collect::<Vec<_>>()
63}