vortex_array/stats/
array.rs

1//! Stats as they are stored on arrays.
2
3use std::sync::Arc;
4
5use parking_lot::RwLock;
6use vortex_error::{VortexError, VortexResult, vortex_panic};
7use vortex_scalar::ScalarValue;
8
9use super::{
10    Precision, Stat, StatType, StatsProvider, StatsProviderExt, StatsSet, StatsSetIntoIter,
11};
12use crate::Array;
13use crate::compute::{
14    MinMaxResult, is_constant, is_sorted, is_strict_sorted, min_max, nan_count, sum,
15};
16
17/// A shared [`StatsSet`] stored in an array. Can be shared by copies of the array and can also be mutated in place.
18// TODO(adamg): This is a very bad name.
19#[derive(Clone, Default, Debug)]
20pub struct ArrayStats {
21    inner: Arc<RwLock<StatsSet>>,
22}
23
24/// Reference to an array's [`StatsSet`]. Can be used to get and mutate the underlying stats.
25///
26/// Constructed by calling [`ArrayStats::to_ref`].
27pub struct StatsSetRef<'a> {
28    // We need to reference back to the array
29    dyn_array_ref: &'a dyn Array,
30    parent_stats: ArrayStats,
31}
32
33impl ArrayStats {
34    pub fn to_ref<'a>(&self, array: &'a dyn Array) -> StatsSetRef<'a> {
35        StatsSetRef {
36            dyn_array_ref: array,
37            parent_stats: self.clone(),
38        }
39    }
40
41    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
42        self.inner.write().set(stat, value);
43    }
44
45    pub fn clear(&self, stat: Stat) {
46        self.inner.write().clear(stat);
47    }
48
49    pub fn retain(&self, stats: &[Stat]) {
50        self.inner.write().retain_only(stats);
51    }
52}
53
54impl From<StatsSet> for ArrayStats {
55    fn from(value: StatsSet) -> Self {
56        Self {
57            inner: Arc::new(RwLock::new(value)),
58        }
59    }
60}
61
62impl From<ArrayStats> for StatsSet {
63    fn from(value: ArrayStats) -> Self {
64        value.inner.read().clone()
65    }
66}
67
68impl StatsProvider for ArrayStats {
69    fn get(&self, stat: Stat) -> Option<Precision<ScalarValue>> {
70        let guard = self.inner.read();
71        guard.get(stat)
72    }
73
74    fn len(&self) -> usize {
75        let guard = self.inner.read();
76        guard.len()
77    }
78}
79
80impl StatsSetRef<'_> {
81    pub fn set_iter(&self, iter: StatsSetIntoIter) {
82        let mut guard = self.parent_stats.inner.write();
83
84        for (stat, value) in iter {
85            guard.set(stat, value);
86        }
87    }
88
89    pub fn inherit(&self, parent_stats: StatsSetRef<'_>) {
90        // TODO(ngates): depending on statistic, this should choose the more precise one
91        self.set_iter(parent_stats.into_iter());
92    }
93
94    // TODO(adamg): potentially problematic name
95    pub fn to_owned(&self) -> StatsSet {
96        self.parent_stats.inner.read().clone()
97    }
98
99    pub fn into_iter(&self) -> StatsSetIntoIter {
100        self.to_owned().into_iter()
101    }
102
103    pub fn compute_stat(&self, stat: Stat) -> VortexResult<Option<ScalarValue>> {
104        // If it's already computed and exact, we can return it.
105        if let Some(Precision::Exact(stat)) = self.get(stat) {
106            return Ok(Some(stat));
107        }
108
109        Ok(match stat {
110            Stat::Min => {
111                min_max(self.dyn_array_ref)?.map(|MinMaxResult { min, max: _ }| min.into_value())
112            }
113            Stat::Max => {
114                min_max(self.dyn_array_ref)?.map(|MinMaxResult { min: _, max }| max.into_value())
115            }
116            Stat::Sum => {
117                Stat::Sum
118                    .dtype(self.dyn_array_ref.dtype())
119                    .is_some()
120                    .then(|| {
121                        // Sum is supported for this dtype.
122                        sum(self.dyn_array_ref)
123                    })
124                    .transpose()?
125                    .map(|s| s.into_value())
126            }
127            Stat::NullCount => Some(self.dyn_array_ref.invalid_count()?.into()),
128            Stat::IsConstant => {
129                if self.dyn_array_ref.is_empty() {
130                    None
131                } else {
132                    is_constant(self.dyn_array_ref)?.map(ScalarValue::from)
133                }
134            }
135            Stat::IsSorted => Some(is_sorted(self.dyn_array_ref)?.into()),
136            Stat::IsStrictSorted => Some(is_strict_sorted(self.dyn_array_ref)?.into()),
137            Stat::UncompressedSizeInBytes => {
138                let nbytes: ScalarValue =
139                    (self.dyn_array_ref.to_canonical()?.as_ref().nbytes() as u64).into();
140                self.set(stat, Precision::exact(nbytes.clone()));
141                Some(nbytes)
142            }
143            Stat::NaNCount => {
144                Stat::NaNCount
145                    .dtype(self.dyn_array_ref.dtype())
146                    .is_some()
147                    .then(|| {
148                        // NaNCount is supported for this dtype.
149                        nan_count(self.dyn_array_ref)
150                    })
151                    .transpose()?
152                    .map(|s| s.into())
153            }
154        })
155    }
156
157    pub fn compute_all(&self, stats: &[Stat]) -> VortexResult<StatsSet> {
158        let mut stats_set = StatsSet::default();
159        for &stat in stats {
160            if let Some(s) = self.compute_stat(stat)? {
161                stats_set.set(stat, Precision::exact(s))
162            }
163        }
164        Ok(stats_set)
165    }
166}
167
168impl StatsSetRef<'_> {
169    pub fn get_as<U: for<'a> TryFrom<&'a ScalarValue, Error = VortexError>>(
170        &self,
171        stat: Stat,
172    ) -> Option<Precision<U>> {
173        StatsProviderExt::get_as::<U>(self, stat)
174    }
175
176    pub fn get_as_bound<S, U>(&self) -> Option<S::Bound>
177    where
178        S: StatType<U>,
179        U: for<'a> TryFrom<&'a ScalarValue, Error = VortexError>,
180    {
181        StatsProviderExt::get_as_bound::<S, U>(self)
182    }
183
184    pub fn compute_as<U: for<'a> TryFrom<&'a ScalarValue, Error = VortexError>>(
185        &self,
186        stat: Stat,
187    ) -> Option<U> {
188        self.compute_stat(stat)
189            .inspect_err(|e| log::warn!("Failed to compute stat {stat}: {e}"))
190            .ok()
191            .flatten()
192            .map(|s| U::try_from(&s))
193            .transpose()
194            .unwrap_or_else(|err| {
195                vortex_panic!(
196                    err,
197                    "Failed to compute stat {} as {}",
198                    stat,
199                    std::any::type_name::<U>()
200                )
201            })
202    }
203
204    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
205        self.parent_stats.set(stat, value);
206    }
207
208    pub fn clear(&self, stat: Stat) {
209        self.parent_stats.clear(stat);
210    }
211
212    pub fn retain(&self, stats: &[Stat]) {
213        self.parent_stats.retain(stats);
214    }
215
216    pub fn compute_min<U: for<'a> TryFrom<&'a ScalarValue, Error = VortexError>>(
217        &self,
218    ) -> Option<U> {
219        self.compute_as(Stat::Min)
220    }
221
222    pub fn compute_max<U: for<'a> TryFrom<&'a ScalarValue, Error = VortexError>>(
223        &self,
224    ) -> Option<U> {
225        self.compute_as(Stat::Max)
226    }
227
228    pub fn compute_is_sorted(&self) -> Option<bool> {
229        self.compute_as(Stat::IsSorted)
230    }
231
232    pub fn compute_is_strict_sorted(&self) -> Option<bool> {
233        self.compute_as(Stat::IsStrictSorted)
234    }
235
236    pub fn compute_is_constant(&self) -> Option<bool> {
237        self.compute_as(Stat::IsConstant)
238    }
239
240    pub fn compute_null_count(&self) -> Option<usize> {
241        self.compute_as(Stat::NullCount)
242    }
243
244    pub fn compute_uncompressed_size_in_bytes(&self) -> Option<usize> {
245        self.compute_as(Stat::UncompressedSizeInBytes)
246    }
247}
248
249impl StatsProvider for StatsSetRef<'_> {
250    fn get(&self, stat: Stat) -> Option<Precision<ScalarValue>> {
251        self.parent_stats.get(stat)
252    }
253
254    fn len(&self) -> usize {
255        self.parent_stats.len()
256    }
257}