vortex_array/stats/
mod.rs1use std::fmt::{Debug, Display, Formatter};
4use std::hash::Hash;
5
6use arrow_buffer::bit_iterator::BitIterator;
7use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
8use enum_iterator::{Sequence, last};
9use log::debug;
10use num_enum::{IntoPrimitive, TryFromPrimitive};
11pub use stats_set::*;
12use vortex_dtype::Nullability::{NonNullable, Nullable};
13use vortex_dtype::{DType, PType};
14
15mod array;
16mod bound;
17pub mod flatbuffers;
18mod precision;
19mod stat_bound;
20mod stats_set;
21mod traits;
22
23pub use array::*;
24pub use bound::{LowerBound, UpperBound};
25pub use precision::Precision;
26pub use stat_bound::*;
27pub use traits::*;
28use vortex_error::VortexExpect;
29
30pub const PRUNING_STATS: &[Stat] = &[Stat::Min, Stat::Max, Stat::Sum, Stat::NullCount];
33
34pub const STATS_TO_WRITE: &[Stat] = &[
36 Stat::Min,
37 Stat::Max,
38 Stat::NullCount,
39 Stat::Sum,
40 Stat::IsConstant,
41 Stat::IsSorted,
42 Stat::IsStrictSorted,
43 Stat::UncompressedSizeInBytes,
44];
45
46#[derive(
47 Debug,
48 Clone,
49 Copy,
50 PartialEq,
51 Eq,
52 PartialOrd,
53 Ord,
54 Hash,
55 Sequence,
56 IntoPrimitive,
57 TryFromPrimitive,
58)]
59#[repr(u8)]
60pub enum Stat {
61 IsConstant = 0,
64 IsSorted = 1,
66 IsStrictSorted = 2,
68 Max = 3,
70 Min = 4,
72 Sum = 5,
74 NullCount = 6,
76 UncompressedSizeInBytes = 7,
78}
79
// Zero-sized marker types, one per `Stat` variant. Each is tied to its variant and to a
// bound type through a `StatType` implementation. They are listed in the same order as
// the `Stat` enum.

/// Marker type for [`Stat::IsConstant`].
pub struct IsConstant;

/// Marker type for [`Stat::IsSorted`].
pub struct IsSorted;

/// Marker type for [`Stat::IsStrictSorted`].
pub struct IsStrictSorted;

/// Marker type for [`Stat::Max`].
pub struct Max;

/// Marker type for [`Stat::Min`].
pub struct Min;

/// Marker type for [`Stat::Sum`].
pub struct Sum;

/// Marker type for [`Stat::NullCount`].
pub struct NullCount;

/// Marker type for [`Stat::UncompressedSizeInBytes`].
pub struct UncompressedSizeInBytes;
90
91impl StatType<bool> for IsConstant {
92 type Bound = Precision<bool>;
93
94 const STAT: Stat = Stat::IsConstant;
95}
96
97impl<T: PartialOrd + Clone> StatType<T> for IsSorted {
98 type Bound = Precision<T>;
99
100 const STAT: Stat = Stat::IsSorted;
101}
102
103impl<T: PartialOrd + Clone> StatType<T> for IsStrictSorted {
104 type Bound = Precision<T>;
105
106 const STAT: Stat = Stat::IsStrictSorted;
107}
108
109impl<T: PartialOrd + Clone> StatType<T> for NullCount {
110 type Bound = UpperBound<T>;
111
112 const STAT: Stat = Stat::NullCount;
113}
114
115impl<T: PartialOrd + Clone> StatType<T> for UncompressedSizeInBytes {
116 type Bound = UpperBound<T>;
117
118 const STAT: Stat = Stat::UncompressedSizeInBytes;
119}
120
121impl<T: PartialOrd + Clone + Debug> StatType<T> for Max {
122 type Bound = UpperBound<T>;
123
124 const STAT: Stat = Stat::Max;
125}
126
127impl<T: PartialOrd + Clone + Debug> StatType<T> for Min {
128 type Bound = LowerBound<T>;
129
130 const STAT: Stat = Stat::Min;
131}
132
133impl<T: PartialOrd + Clone + Debug> StatType<T> for Sum {
134 type Bound = Precision<T>;
135
136 const STAT: Stat = Stat::Sum;
137}
138
139impl Stat {
140 pub fn is_commutative(&self) -> bool {
143 match self {
145 Stat::IsConstant
146 | Stat::Max
147 | Stat::Min
148 | Stat::NullCount
149 | Stat::Sum
150 | Stat::UncompressedSizeInBytes => true,
151 Stat::IsSorted | Stat::IsStrictSorted => false,
152 }
153 }
154
155 pub fn has_same_dtype_as_array(&self) -> bool {
157 matches!(self, Stat::Min | Stat::Max)
158 }
159
160 pub fn dtype(&self, data_type: &DType) -> Option<DType> {
161 Some(match self {
162 Stat::IsConstant => DType::Bool(NonNullable),
163 Stat::IsSorted => DType::Bool(NonNullable),
164 Stat::IsStrictSorted => DType::Bool(NonNullable),
165 Stat::Max => data_type.clone(),
166 Stat::Min => data_type.clone(),
167 Stat::NullCount => DType::Primitive(PType::U64, NonNullable),
168 Stat::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
169 Stat::Sum => {
170 match data_type {
174 DType::Bool(_) => DType::Primitive(PType::U64, Nullable),
175 DType::Primitive(ptype, _) => match ptype {
176 PType::U8 | PType::U16 | PType::U32 | PType::U64 => {
177 DType::Primitive(PType::U64, Nullable)
178 }
179 PType::I8 | PType::I16 | PType::I32 | PType::I64 => {
180 DType::Primitive(PType::I64, Nullable)
181 }
182 PType::F16 | PType::F32 | PType::F64 => {
183 DType::Primitive(PType::F64, NonNullable)
185 }
186 },
187 DType::Extension(ext_dtype) => self.dtype(ext_dtype.storage_dtype())?,
188 DType::Null
190 | DType::Utf8(_)
191 | DType::Binary(_)
192 | DType::Struct(..)
193 | DType::List(..) => return None,
194 }
195 }
196 })
197 }
198
199 pub fn name(&self) -> &str {
200 match self {
201 Self::IsConstant => "is_constant",
202 Self::IsSorted => "is_sorted",
203 Self::IsStrictSorted => "is_strict_sorted",
204 Self::Max => "max",
205 Self::Min => "min",
206 Self::NullCount => "null_count",
207 Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
208 Stat::Sum => "sum",
209 }
210 }
211}
212
213pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
214 let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize;
215 let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
217 MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
218 max_stat,
219 );
220 for stat in stats {
221 stat_bitset.set_bit(u8::from(*stat) as usize, true);
222 }
223
224 stat_bitset
225 .finish()
226 .into_inner()
227 .into_vec()
228 .unwrap_or_else(|b| b.to_vec())
229}
230
231pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
232 BitIterator::new(bytes, 0, bytes.len() * 8)
233 .enumerate()
234 .filter_map(|(i, b)| b.then_some(i))
235 .filter_map(|i| {
237 let Ok(stat) = u8::try_from(i) else {
238 debug!("invalid stat encountered: {i}");
239 return None;
240 };
241 Stat::try_from(stat).ok()
242 })
243 .collect::<Vec<_>>()
244}
245
246impl Display for Stat {
247 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
248 write!(f, "{}", self.name())
249 }
250}
251
#[cfg(test)]
mod test {
    use enum_iterator::all;

    use crate::array::Array;
    use crate::arrays::PrimitiveArray;
    use crate::stats::Stat;

    /// Computing the minimum of an all-null array must yield `None` rather than panic.
    #[test]
    fn min_of_nulls_is_not_panic() {
        let all_nulls = PrimitiveArray::from_option_iter::<i32, _>([None, None, None, None]);
        let min = all_nulls.statistics().compute_as::<i64>(Stat::Min);

        assert_eq!(min, None);
    }

    /// Only `Min` and `Max` share the array's dtype; every other stat must report false.
    #[test]
    fn has_same_dtype_as_array() {
        for stat in all::<Stat>() {
            let expected = matches!(stat, Stat::Min | Stat::Max);
            assert_eq!(stat.has_same_dtype_as_array(), expected);
        }
    }
}