Skip to main content

vortex_array/stats/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Stats as they are stored on arrays.
5
6use std::sync::Arc;
7
8use parking_lot::RwLock;
9use vortex_error::VortexError;
10use vortex_error::VortexResult;
11use vortex_error::vortex_panic;
12
13use super::MutTypedStatsSetRef;
14use super::StatsSet;
15use super::StatsSetIntoIter;
16use super::TypedStatsSetRef;
17use crate::Array;
18use crate::builders::builder_with_capacity;
19use crate::compute::MinMaxResult;
20use crate::compute::is_constant;
21use crate::compute::is_sorted;
22use crate::compute::is_strict_sorted;
23use crate::compute::min_max;
24use crate::compute::nan_count;
25use crate::compute::sum;
26use crate::expr::stats::Precision;
27use crate::expr::stats::Stat;
28use crate::expr::stats::StatsProvider;
29use crate::scalar::Scalar;
30use crate::scalar::ScalarValue;
31
32/// A shared [`StatsSet`] stored in an array. Can be shared by copies of the array and can also be mutated in place.
33// TODO(adamg): This is a very bad name.
34#[derive(Clone, Default, Debug)]
35pub struct ArrayStats {
36    inner: Arc<RwLock<StatsSet>>,
37}
38
39/// Reference to an array's [`StatsSet`]. Can be used to get and mutate the underlying stats.
40///
41/// Constructed by calling [`ArrayStats::to_ref`].
42pub struct StatsSetRef<'a> {
43    // We need to reference back to the array
44    dyn_array_ref: &'a dyn Array,
45    array_stats: &'a ArrayStats,
46}
47
48impl ArrayStats {
49    pub fn to_ref<'a>(&'a self, array: &'a dyn Array) -> StatsSetRef<'a> {
50        StatsSetRef {
51            dyn_array_ref: array,
52            array_stats: self,
53        }
54    }
55
56    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
57        self.inner.write().set(stat, value);
58    }
59
60    pub fn clear(&self, stat: Stat) {
61        self.inner.write().clear(stat);
62    }
63
64    pub fn retain(&self, stats: &[Stat]) {
65        self.inner.write().retain_only(stats);
66    }
67}
68
69impl From<StatsSet> for ArrayStats {
70    fn from(value: StatsSet) -> Self {
71        Self {
72            inner: Arc::new(RwLock::new(value)),
73        }
74    }
75}
76
77impl From<ArrayStats> for StatsSet {
78    fn from(value: ArrayStats) -> Self {
79        value.inner.read().clone()
80    }
81}
82
83impl StatsSetRef<'_> {
84    pub fn set_iter(&self, iter: StatsSetIntoIter) {
85        let mut guard = self.array_stats.inner.write();
86        for (stat, value) in iter {
87            guard.set(stat, value);
88        }
89    }
90
91    pub fn inherit_from(&self, stats: StatsSetRef<'_>) {
92        // Only inherit if the underlying stats are different
93        if !Arc::ptr_eq(&self.array_stats.inner, &stats.array_stats.inner) {
94            stats.with_iter(|iter| self.inherit(iter));
95        }
96    }
97
98    pub fn inherit<'a>(&self, iter: impl Iterator<Item = &'a (Stat, Precision<ScalarValue>)>) {
99        let mut guard = self.array_stats.inner.write();
100        for (stat, value) in iter {
101            if !value.is_exact() {
102                if !guard.get(*stat).is_some_and(|v| v.is_exact()) {
103                    guard.set(*stat, value.clone());
104                }
105            } else {
106                guard.set(*stat, value.clone());
107            }
108        }
109    }
110
111    pub fn with_typed_stats_set<U, F: FnOnce(TypedStatsSetRef) -> U>(&self, apply: F) -> U {
112        apply(
113            self.array_stats
114                .inner
115                .read()
116                .as_typed_ref(self.dyn_array_ref.dtype()),
117        )
118    }
119
120    pub fn with_mut_typed_stats_set<U, F: FnOnce(MutTypedStatsSetRef) -> U>(&self, apply: F) -> U {
121        apply(
122            self.array_stats
123                .inner
124                .write()
125                .as_mut_typed_ref(self.dyn_array_ref.dtype()),
126        )
127    }
128
129    pub fn to_owned(&self) -> StatsSet {
130        self.array_stats.inner.read().clone()
131    }
132
133    pub fn with_iter<
134        F: for<'a> FnOnce(&mut dyn Iterator<Item = &'a (Stat, Precision<ScalarValue>)>) -> R,
135        R,
136    >(
137        &self,
138        f: F,
139    ) -> R {
140        let lock = self.array_stats.inner.read();
141        f(&mut lock.iter())
142    }
143
144    pub fn compute_stat(&self, stat: Stat) -> VortexResult<Option<Scalar>> {
145        // If it's already computed and exact, we can return it.
146        if let Some(Precision::Exact(s)) = self.get(stat) {
147            return Ok(Some(s));
148        }
149
150        let array_ref = self.dyn_array_ref.to_array();
151        Ok(match stat {
152            Stat::Min => min_max(&array_ref)?.map(|MinMaxResult { min, max: _ }| min),
153            Stat::Max => min_max(&array_ref)?.map(|MinMaxResult { min: _, max }| max),
154            Stat::Sum => {
155                Stat::Sum
156                    .dtype(self.dyn_array_ref.dtype())
157                    .is_some()
158                    .then(|| {
159                        // Sum is supported for this dtype.
160                        sum(&array_ref)
161                    })
162                    .transpose()?
163            }
164            Stat::NullCount => self.dyn_array_ref.invalid_count().ok().map(Into::into),
165            Stat::IsConstant => {
166                if self.dyn_array_ref.is_empty() {
167                    None
168                } else {
169                    is_constant(&array_ref)?.map(|v| v.into())
170                }
171            }
172            Stat::IsSorted => is_sorted(&array_ref)?.map(|v| v.into()),
173            Stat::IsStrictSorted => is_strict_sorted(&array_ref)?.map(|v| v.into()),
174            Stat::UncompressedSizeInBytes => {
175                let mut builder =
176                    builder_with_capacity(self.dyn_array_ref.dtype(), self.dyn_array_ref.len());
177                unsafe {
178                    builder.extend_from_array_unchecked(&array_ref);
179                }
180                let nbytes = builder.finish().nbytes();
181                self.set(stat, Precision::exact(nbytes));
182                Some(nbytes.into())
183            }
184            Stat::NaNCount => {
185                Stat::NaNCount
186                    .dtype(self.dyn_array_ref.dtype())
187                    .is_some()
188                    .then(|| {
189                        // NaNCount is supported for this dtype.
190                        nan_count(&array_ref)
191                    })
192                    .transpose()?
193                    .map(|s| s.into())
194            }
195        })
196    }
197
198    pub fn compute_all(&self, stats: &[Stat]) -> VortexResult<StatsSet> {
199        let mut stats_set = StatsSet::default();
200        for &stat in stats {
201            if let Some(s) = self.compute_stat(stat)?
202                && let Some(value) = s.into_value()
203            {
204                stats_set.set(stat, Precision::exact(value));
205            }
206        }
207        Ok(stats_set)
208    }
209}
210
211impl StatsSetRef<'_> {
212    pub fn compute_as<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
213        &self,
214        stat: Stat,
215    ) -> Option<U> {
216        self.compute_stat(stat)
217            .inspect_err(|e| tracing::warn!("Failed to compute stat {stat}: {e}"))
218            .ok()
219            .flatten()
220            .map(|s| U::try_from(&s))
221            .transpose()
222            .unwrap_or_else(|err| {
223                vortex_panic!(
224                    err,
225                    "Failed to compute stat {} as {}",
226                    stat,
227                    std::any::type_name::<U>()
228                )
229            })
230    }
231
232    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
233        self.array_stats.set(stat, value);
234    }
235
236    pub fn clear(&self, stat: Stat) {
237        self.array_stats.clear(stat);
238    }
239
240    pub fn compute_min<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
241        self.compute_as(Stat::Min)
242    }
243
244    pub fn compute_max<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
245        self.compute_as(Stat::Max)
246    }
247
248    pub fn compute_is_sorted(&self) -> Option<bool> {
249        self.compute_as(Stat::IsSorted)
250    }
251
252    pub fn compute_is_strict_sorted(&self) -> Option<bool> {
253        self.compute_as(Stat::IsStrictSorted)
254    }
255
256    pub fn compute_is_constant(&self) -> Option<bool> {
257        self.compute_as(Stat::IsConstant)
258    }
259
260    pub fn compute_null_count(&self) -> Option<usize> {
261        self.compute_as(Stat::NullCount)
262    }
263
264    pub fn compute_uncompressed_size_in_bytes(&self) -> Option<usize> {
265        self.compute_as(Stat::UncompressedSizeInBytes)
266    }
267}
268
269impl StatsProvider for StatsSetRef<'_> {
270    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
271        self.array_stats
272            .inner
273            .read()
274            .as_typed_ref(self.dyn_array_ref.dtype())
275            .get(stat)
276    }
277
278    fn len(&self) -> usize {
279        self.array_stats.inner.read().len()
280    }
281}