vortex_array/stats/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Stats as they are stored on arrays.
5
6use std::sync::Arc;
7
8use parking_lot::RwLock;
9use vortex_error::{VortexError, VortexResult, vortex_panic};
10use vortex_scalar::{Scalar, ScalarValue};
11
12use super::{Precision, Stat, StatsProvider, StatsSet, StatsSetIntoIter, TypedStatsSetRef};
13use crate::Array;
14use crate::builders::builder_with_capacity;
15use crate::compute::{
16    MinMaxResult, is_constant, is_sorted, is_strict_sorted, min_max, nan_count, sum,
17};
18
19/// A shared [`StatsSet`] stored in an array. Can be shared by copies of the array and can also be mutated in place.
20// TODO(adamg): This is a very bad name.
21#[derive(Clone, Default, Debug)]
22pub struct ArrayStats {
23    inner: Arc<RwLock<StatsSet>>,
24}
25
26/// Reference to an array's [`StatsSet`]. Can be used to get and mutate the underlying stats.
27///
28/// Constructed by calling [`ArrayStats::to_ref`].
29pub struct StatsSetRef<'a> {
30    // We need to reference back to the array
31    dyn_array_ref: &'a dyn Array,
32    array_stats: &'a ArrayStats,
33}
34
35impl ArrayStats {
36    pub fn to_ref<'a>(&'a self, array: &'a dyn Array) -> StatsSetRef<'a> {
37        StatsSetRef {
38            dyn_array_ref: array,
39            array_stats: self,
40        }
41    }
42
43    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
44        self.inner.write().set(stat, value);
45    }
46
47    pub fn clear(&self, stat: Stat) {
48        self.inner.write().clear(stat);
49    }
50
51    pub fn retain(&self, stats: &[Stat]) {
52        self.inner.write().retain_only(stats);
53    }
54}
55
56impl From<StatsSet> for ArrayStats {
57    fn from(value: StatsSet) -> Self {
58        Self {
59            inner: Arc::new(RwLock::new(value)),
60        }
61    }
62}
63
64impl From<ArrayStats> for StatsSet {
65    fn from(value: ArrayStats) -> Self {
66        value.inner.read().clone()
67    }
68}
69
70impl StatsSetRef<'_> {
71    pub fn set_iter(&self, iter: StatsSetIntoIter) {
72        let mut guard = self.array_stats.inner.write();
73        for (stat, value) in iter {
74            guard.set(stat, value);
75        }
76    }
77
78    pub fn inherit_from(&self, stats: StatsSetRef<'_>) {
79        // Only inherit if the underlying stats are different
80        if !Arc::ptr_eq(&self.array_stats.inner, &stats.array_stats.inner) {
81            stats.with_iter(|iter| self.inherit(iter));
82        }
83    }
84
85    pub fn inherit<'a>(&self, iter: impl Iterator<Item = &'a (Stat, Precision<ScalarValue>)>) {
86        let mut guard = self.array_stats.inner.write();
87        for (stat, value) in iter {
88            if !value.is_exact() {
89                if !guard.get(*stat).is_some_and(|v| v.is_exact()) {
90                    guard.set(*stat, value.clone());
91                }
92            } else {
93                guard.set(*stat, value.clone());
94            }
95        }
96    }
97
98    pub fn with_typed_stats_set<U, F: FnOnce(TypedStatsSetRef) -> U>(&self, apply: F) -> U {
99        apply(
100            self.array_stats
101                .inner
102                .read()
103                .as_typed_ref(self.dyn_array_ref.dtype()),
104        )
105    }
106
107    pub fn to_owned(&self) -> StatsSet {
108        self.array_stats.inner.read().clone()
109    }
110
111    pub fn with_iter<
112        F: for<'a> FnOnce(&mut dyn Iterator<Item = &'a (Stat, Precision<ScalarValue>)>) -> R,
113        R,
114    >(
115        &self,
116        f: F,
117    ) -> R {
118        let lock = self.array_stats.inner.read();
119        f(&mut lock.iter())
120    }
121
122    pub fn compute_stat(&self, stat: Stat) -> VortexResult<Option<Scalar>> {
123        // If it's already computed and exact, we can return it.
124        if let Some(Precision::Exact(s)) = self.get(stat) {
125            return Ok(Some(s));
126        }
127
128        Ok(match stat {
129            Stat::Min => min_max(self.dyn_array_ref)?.map(|MinMaxResult { min, max: _ }| min),
130            Stat::Max => min_max(self.dyn_array_ref)?.map(|MinMaxResult { min: _, max }| max),
131            Stat::Sum => {
132                Stat::Sum
133                    .dtype(self.dyn_array_ref.dtype())
134                    .is_some()
135                    .then(|| {
136                        // Sum is supported for this dtype.
137                        sum(self.dyn_array_ref)
138                    })
139                    .transpose()?
140            }
141            Stat::NullCount => Some(self.dyn_array_ref.invalid_count().into()),
142            Stat::IsConstant => {
143                if self.dyn_array_ref.is_empty() {
144                    None
145                } else {
146                    is_constant(self.dyn_array_ref)?.map(|v| v.into())
147                }
148            }
149            Stat::IsSorted => is_sorted(self.dyn_array_ref)?.map(|v| v.into()),
150            Stat::IsStrictSorted => is_strict_sorted(self.dyn_array_ref)?.map(|v| v.into()),
151            Stat::UncompressedSizeInBytes => {
152                let mut builder =
153                    builder_with_capacity(self.dyn_array_ref.dtype(), self.dyn_array_ref.len());
154                unsafe {
155                    builder.extend_from_array_unchecked(self.dyn_array_ref);
156                }
157                let nbytes = builder.finish().nbytes();
158                self.set(stat, Precision::exact(nbytes));
159                Some(nbytes.into())
160            }
161            Stat::NaNCount => {
162                Stat::NaNCount
163                    .dtype(self.dyn_array_ref.dtype())
164                    .is_some()
165                    .then(|| {
166                        // NaNCount is supported for this dtype.
167                        nan_count(self.dyn_array_ref)
168                    })
169                    .transpose()?
170                    .map(|s| s.into())
171            }
172        })
173    }
174
175    pub fn compute_all(&self, stats: &[Stat]) -> VortexResult<StatsSet> {
176        let mut stats_set = StatsSet::default();
177        for &stat in stats {
178            if let Some(s) = self.compute_stat(stat)? {
179                stats_set.set(stat, Precision::exact(s.into_value()))
180            }
181        }
182        Ok(stats_set)
183    }
184}
185
186impl StatsSetRef<'_> {
187    pub fn compute_as<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
188        &self,
189        stat: Stat,
190    ) -> Option<U> {
191        self.compute_stat(stat)
192            .inspect_err(|e| log::warn!("Failed to compute stat {stat}: {e}"))
193            .ok()
194            .flatten()
195            .map(|s| U::try_from(&s))
196            .transpose()
197            .unwrap_or_else(|err| {
198                vortex_panic!(
199                    err,
200                    "Failed to compute stat {} as {}",
201                    stat,
202                    std::any::type_name::<U>()
203                )
204            })
205    }
206
207    pub fn set(&self, stat: Stat, value: Precision<ScalarValue>) {
208        self.array_stats.set(stat, value);
209    }
210
211    pub fn clear(&self, stat: Stat) {
212        self.array_stats.clear(stat);
213    }
214
215    pub fn retain(&self, stats: &[Stat]) {
216        self.array_stats.retain(stats);
217    }
218
219    pub fn compute_min<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
220        self.compute_as(Stat::Min)
221    }
222
223    pub fn compute_max<U: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self) -> Option<U> {
224        self.compute_as(Stat::Max)
225    }
226
227    pub fn compute_is_sorted(&self) -> Option<bool> {
228        self.compute_as(Stat::IsSorted)
229    }
230
231    pub fn compute_is_strict_sorted(&self) -> Option<bool> {
232        self.compute_as(Stat::IsStrictSorted)
233    }
234
235    pub fn compute_is_constant(&self) -> Option<bool> {
236        self.compute_as(Stat::IsConstant)
237    }
238
239    pub fn compute_null_count(&self) -> Option<usize> {
240        self.compute_as(Stat::NullCount)
241    }
242
243    pub fn compute_uncompressed_size_in_bytes(&self) -> Option<usize> {
244        self.compute_as(Stat::UncompressedSizeInBytes)
245    }
246}
247
248impl StatsProvider for StatsSetRef<'_> {
249    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
250        self.array_stats
251            .inner
252            .read()
253            .as_typed_ref(self.dyn_array_ref.dtype())
254            .get(stat)
255    }
256
257    fn len(&self) -> usize {
258        self.array_stats.inner.read().len()
259    }
260}