vortex_array/stats/
mod.rs1use std::fmt::{Debug, Display, Formatter};
4use std::hash::Hash;
5
6use arrow_buffer::bit_iterator::BitIterator;
7use arrow_buffer::{BooleanBufferBuilder, MutableBuffer};
8use enum_iterator::{Sequence, all, last};
9use log::debug;
10use num_enum::{IntoPrimitive, TryFromPrimitive};
11pub use stats_set::*;
12use vortex_dtype::Nullability::{NonNullable, Nullable};
13use vortex_dtype::{DType, PType};
14
15mod array;
16mod bound;
17pub mod flatbuffers;
18mod precision;
19mod stat_bound;
20mod stats_set;
21mod traits;
22
23pub use array::*;
24pub use bound::{LowerBound, UpperBound};
25pub use precision::Precision;
26pub use stat_bound::*;
27pub use traits::*;
28use vortex_error::VortexExpect;
29
30pub const PRUNING_STATS: &[Stat] = &[
33 Stat::Min,
34 Stat::Max,
35 Stat::Sum,
36 Stat::NullCount,
37 Stat::NaNCount,
38];
39
40#[derive(
41 Debug,
42 Clone,
43 Copy,
44 PartialEq,
45 Eq,
46 PartialOrd,
47 Ord,
48 Hash,
49 Sequence,
50 IntoPrimitive,
51 TryFromPrimitive,
52)]
53#[repr(u8)]
54pub enum Stat {
55 IsConstant = 0,
58 IsSorted = 1,
60 IsStrictSorted = 2,
62 Max = 3,
64 Min = 4,
66 Sum = 5,
68 NullCount = 6,
70 UncompressedSizeInBytes = 7,
72 NaNCount = 8,
74}
75
// Zero-sized marker types, one per `Stat` variant and listed in the enum's discriminant
// order. Each implements `StatType`, which names the matching `Stat` through the `STAT`
// constant and selects the wrapper type used for its value through `Bound`.

/// Marker type for [`Stat::IsConstant`].
pub struct IsConstant;
/// Marker type for [`Stat::IsSorted`].
pub struct IsSorted;
/// Marker type for [`Stat::IsStrictSorted`].
pub struct IsStrictSorted;
/// Marker type for [`Stat::Max`].
pub struct Max;
/// Marker type for [`Stat::Min`].
pub struct Min;
/// Marker type for [`Stat::Sum`].
pub struct Sum;
/// Marker type for [`Stat::NullCount`].
pub struct NullCount;
/// Marker type for [`Stat::UncompressedSizeInBytes`].
pub struct UncompressedSizeInBytes;
/// Marker type for [`Stat::NaNCount`].
pub struct NaNCount;
87
88impl StatType<bool> for IsConstant {
89 type Bound = Precision<bool>;
90
91 const STAT: Stat = Stat::IsConstant;
92}
93
94impl StatType<bool> for IsSorted {
95 type Bound = Precision<bool>;
96
97 const STAT: Stat = Stat::IsSorted;
98}
99
100impl StatType<bool> for IsStrictSorted {
101 type Bound = Precision<bool>;
102
103 const STAT: Stat = Stat::IsStrictSorted;
104}
105
106impl<T: PartialOrd + Clone> StatType<T> for NullCount {
107 type Bound = UpperBound<T>;
108
109 const STAT: Stat = Stat::NullCount;
110}
111
112impl<T: PartialOrd + Clone> StatType<T> for UncompressedSizeInBytes {
113 type Bound = UpperBound<T>;
114
115 const STAT: Stat = Stat::UncompressedSizeInBytes;
116}
117
118impl<T: PartialOrd + Clone + Debug> StatType<T> for Max {
119 type Bound = UpperBound<T>;
120
121 const STAT: Stat = Stat::Max;
122}
123
124impl<T: PartialOrd + Clone + Debug> StatType<T> for Min {
125 type Bound = LowerBound<T>;
126
127 const STAT: Stat = Stat::Min;
128}
129
130impl<T: PartialOrd + Clone + Debug> StatType<T> for Sum {
131 type Bound = Precision<T>;
132
133 const STAT: Stat = Stat::Sum;
134}
135
136impl<T: PartialOrd + Clone> StatType<T> for NaNCount {
137 type Bound = UpperBound<T>;
138
139 const STAT: Stat = Stat::NaNCount;
140}
141
142impl Stat {
143 pub fn is_commutative(&self) -> bool {
146 match self {
148 Self::IsConstant
149 | Self::Max
150 | Self::Min
151 | Self::NullCount
152 | Self::Sum
153 | Self::NaNCount
154 | Self::UncompressedSizeInBytes => true,
155 Self::IsSorted | Self::IsStrictSorted => false,
156 }
157 }
158
159 pub fn has_same_dtype_as_array(&self) -> bool {
161 matches!(self, Stat::Min | Stat::Max)
162 }
163
164 pub fn dtype(&self, data_type: &DType) -> Option<DType> {
166 Some(match self {
167 Self::IsConstant => DType::Bool(NonNullable),
168 Self::IsSorted => DType::Bool(NonNullable),
169 Self::IsStrictSorted => DType::Bool(NonNullable),
170 Self::Max => data_type.clone(),
171 Self::Min => data_type.clone(),
172 Self::NullCount => DType::Primitive(PType::U64, NonNullable),
173 Self::UncompressedSizeInBytes => DType::Primitive(PType::U64, NonNullable),
174 Self::NaNCount => match data_type {
175 DType::Primitive(ptype, ..) if ptype.is_float() => {
176 DType::Primitive(PType::U64, NonNullable)
177 }
178 _ => return None,
180 },
181 Self::Sum => {
182 match data_type {
186 DType::Bool(_) => DType::Primitive(PType::U64, Nullable),
187 DType::Primitive(ptype, _) => match ptype {
188 PType::U8 | PType::U16 | PType::U32 | PType::U64 => {
189 DType::Primitive(PType::U64, Nullable)
190 }
191 PType::I8 | PType::I16 | PType::I32 | PType::I64 => {
192 DType::Primitive(PType::I64, Nullable)
193 }
194 PType::F16 | PType::F32 | PType::F64 => {
195 DType::Primitive(PType::F64, NonNullable)
197 }
198 },
199 DType::Extension(ext_dtype) => self.dtype(ext_dtype.storage_dtype())?,
200 DType::Null
202 | DType::Decimal(..)
204 | DType::Utf8(_)
205 | DType::Binary(_)
206 | DType::Struct(..)
207 | DType::List(..) => return None,
208 }
209 }
210 })
211 }
212
213 pub fn name(&self) -> &str {
214 match self {
215 Self::IsConstant => "is_constant",
216 Self::IsSorted => "is_sorted",
217 Self::IsStrictSorted => "is_strict_sorted",
218 Self::Max => "max",
219 Self::Min => "min",
220 Self::NullCount => "null_count",
221 Self::UncompressedSizeInBytes => "uncompressed_size_in_bytes",
222 Self::Sum => "sum",
223 Self::NaNCount => "nan_count",
224 }
225 }
226
227 pub fn all() -> impl Iterator<Item = Stat> {
228 all::<Self>()
229 }
230}
231
232pub fn as_stat_bitset_bytes(stats: &[Stat]) -> Vec<u8> {
233 let max_stat = u8::from(last::<Stat>().vortex_expect("last stat")) as usize + 1;
234 let mut stat_bitset = BooleanBufferBuilder::new_from_buffer(
236 MutableBuffer::from_len_zeroed(max_stat.div_ceil(8)),
237 max_stat,
238 );
239 for stat in stats {
240 stat_bitset.set_bit(u8::from(*stat) as usize, true);
241 }
242
243 stat_bitset
244 .finish()
245 .into_inner()
246 .into_vec()
247 .unwrap_or_else(|b| b.to_vec())
248}
249
250pub fn stats_from_bitset_bytes(bytes: &[u8]) -> Vec<Stat> {
251 BitIterator::new(bytes, 0, bytes.len() * 8)
252 .enumerate()
253 .filter_map(|(i, b)| b.then_some(i))
254 .filter_map(|i| {
256 let Ok(stat) = u8::try_from(i) else {
257 debug!("invalid stat encountered: {i}");
258 return None;
259 };
260 Stat::try_from(stat).ok()
261 })
262 .collect::<Vec<_>>()
263}
264
265impl Display for Stat {
266 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
267 write!(f, "{}", self.name())
268 }
269}
270
#[cfg(test)]
mod test {
    use enum_iterator::all;

    use crate::arrays::PrimitiveArray;
    use crate::stats::Stat;

    /// Computing the minimum of an all-null array yields `None` rather than panicking.
    #[test]
    fn min_of_nulls_is_not_panic() {
        let all_null = PrimitiveArray::from_option_iter::<i32, _>([None, None, None, None]);

        let min = all_null.statistics().compute_as::<i64>(Stat::Min);

        assert_eq!(min, None);
    }

    /// Only `Min` and `Max` report the same dtype as the array.
    #[test]
    fn has_same_dtype_as_array() {
        for stat in all::<Stat>() {
            let expected = matches!(stat, Stat::Min | Stat::Max);
            assert_eq!(stat.has_same_dtype_as_array(), expected);
        }
    }
}