vortex_array/arrays/varbin/
stats.rs1use std::cmp::Ordering;
2
3use vortex_error::{VortexResult, vortex_panic};
4
5use crate::Array;
6use crate::accessor::ArrayAccessor;
7use crate::arrays::VarBinEncoding;
8use crate::arrays::varbin::VarBinArray;
9use crate::compute::scalar_at;
10use crate::nbytes::NBytes;
11use crate::stats::{Precision, Stat, StatsSet};
12use crate::vtable::StatisticsVTable;
13
14impl StatisticsVTable<&VarBinArray> for VarBinEncoding {
15 fn compute_statistics(&self, array: &VarBinArray, stat: Stat) -> VortexResult<StatsSet> {
16 compute_varbin_statistics(array, stat)
17 }
18}
19
20pub fn compute_varbin_statistics<T: ArrayAccessor<[u8]> + Array>(
21 array: &T,
22 stat: Stat,
23) -> VortexResult<StatsSet> {
24 if array.is_empty() {
25 return Ok(StatsSet::empty_array());
26 }
27
28 Ok(match stat {
29 Stat::NullCount => {
30 let null_count = array.validity_mask()?.false_count();
31 if null_count == array.len() {
32 return Ok(StatsSet::nulls(array.len()));
33 }
34
35 let mut stats = StatsSet::of(Stat::NullCount, Precision::exact(null_count));
36 if null_count > 0 {
37 stats.set(Stat::IsConstant, Precision::exact(false));
39 }
40 stats
41 }
42 Stat::IsConstant => {
43 let is_constant = array.with_iterator(compute_is_constant)?;
44 if is_constant {
45 StatsSet::constant(scalar_at(array, 0)?, array.len())
47 } else {
48 StatsSet::of(Stat::IsConstant, Precision::exact(is_constant))
49 }
50 }
51 Stat::IsSorted => {
52 let is_sorted = array.with_iterator(|iter| iter.flatten().is_sorted())?;
53 let mut stats = StatsSet::of(Stat::IsSorted, Precision::exact(is_sorted));
54 if !is_sorted {
55 stats.set(Stat::IsStrictSorted, Precision::exact(false));
56 }
57 stats
58 }
59 Stat::IsStrictSorted => {
60 let is_strict_sorted = array.with_iterator(|iter| {
61 iter.flatten()
62 .is_sorted_by(|a, b| matches!(a.cmp(b), Ordering::Less))
63 })?;
64 let mut stats = StatsSet::of(Stat::IsStrictSorted, Precision::exact(is_strict_sorted));
65 if is_strict_sorted {
66 stats.set(Stat::IsSorted, Precision::exact(true));
67 }
68 stats
69 }
70 Stat::UncompressedSizeInBytes => StatsSet::of(stat, Precision::exact(array.nbytes())),
71 Stat::Min | Stat::Max => {
72 vortex_panic!(
74 "Unreachable, stat {} should have already been handled",
75 stat
76 )
77 }
78 Stat::Sum => unreachable!("Sum is not supported for VarBinArray"),
79 })
80}
81
82pub(super) fn compute_is_constant(iter: &mut dyn Iterator<Item = Option<&[u8]>>) -> bool {
83 let Some(first_value) = iter.next() else {
84 return true; };
86 for v in iter {
87 if v != first_value {
88 return false;
89 }
90 }
91 true
92}
93
#[cfg(test)]
mod test {
    use std::ops::Deref;

    use vortex_buffer::{BufferString, ByteBuffer};
    use vortex_dtype::{DType, Nullability};

    use crate::array::Array;
    use crate::arrays::varbin::VarBinArray;

    // The two fixture values. `SHORT` is a prefix of `LONG`, so it is the expected minimum and
    // `LONG` the expected maximum.
    const SHORT: &str = "hello world";
    const LONG: &str = "hello world this is a long string";

    /// Builds the two-element fixture array with the given dtype.
    fn fixture(dtype: DType) -> VarBinArray {
        VarBinArray::from_vec(vec![SHORT, LONG], dtype)
    }

    /// The fixture holds two distinct values in ascending order.
    fn assert_not_constant_but_sorted(arr: &VarBinArray) {
        assert!(!arr.statistics().compute_is_constant().unwrap());
        assert!(arr.statistics().compute_is_sorted().unwrap());
    }

    #[test]
    fn utf8_stats() {
        let arr = fixture(DType::Utf8(Nullability::NonNullable));

        let min = arr.statistics().compute_min::<BufferString>().unwrap();
        assert_eq!(min, BufferString::from(SHORT.to_string()));

        let max = arr.statistics().compute_max::<BufferString>().unwrap();
        assert_eq!(max, BufferString::from(LONG.to_string()));

        assert_not_constant_but_sorted(&arr);
    }

    #[test]
    fn binary_stats() {
        let arr = fixture(DType::Binary(Nullability::NonNullable));

        let min = arr.statistics().compute_min::<ByteBuffer>().unwrap();
        assert_eq!(min.deref(), SHORT.as_bytes());

        let max = arr.statistics().compute_max::<ByteBuffer>().unwrap();
        assert_eq!(max.deref(), LONG.as_bytes());

        assert_not_constant_but_sorted(&arr);
    }
}