Skip to main content

vortex_array/stats/
stats_set.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5
6use enum_iterator::all;
7use num_traits::CheckedAdd;
8use smallvec::SmallVec;
9use smallvec::smallvec;
10use vortex_error::VortexError;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_err;
14use vortex_error::vortex_panic;
15
16use crate::dtype::DType;
17use crate::expr::stats::IsConstant;
18use crate::expr::stats::IsSorted;
19use crate::expr::stats::IsStrictSorted;
20use crate::expr::stats::Max;
21use crate::expr::stats::Min;
22use crate::expr::stats::NaNCount;
23use crate::expr::stats::NullCount;
24use crate::expr::stats::Precision;
25use crate::expr::stats::Stat;
26use crate::expr::stats::StatBound;
27use crate::expr::stats::StatType;
28use crate::expr::stats::StatsProvider;
29use crate::expr::stats::StatsProviderExt;
30use crate::expr::stats::Sum;
31use crate::expr::stats::UncompressedSizeInBytes;
32use crate::scalar::Scalar;
33use crate::scalar::ScalarValue;
34
35/// Type of the SmallVec stored inside StatsSet
36pub type StatsArray = [(Stat, Precision<ScalarValue>); 4];
37
38#[derive(Default, Debug, Clone)]
39pub struct StatsSet {
40    values: SmallVec<StatsArray>,
41}
42
43impl StatsSet {
44    /// Create new StatSet without validating uniqueness of all the entries
45    ///
46    /// # Safety
47    ///
48    /// This method will not panic or trigger UB, but may lead to duplicate stats being stored.
49    pub unsafe fn new_unchecked(values: SmallVec<StatsArray>) -> Self {
50        Self { values }
51    }
52
53    /// Create StatsSet from single stat and value
54    pub fn of(stat: Stat, value: Precision<ScalarValue>) -> Self {
55        Self {
56            values: smallvec![(stat, value)],
57        }
58    }
59
60    /// Wrap stats set with a dtype for mutable typed scalar access
61    pub fn as_mut_typed_ref<'a, 'b>(&'a mut self, dtype: &'b DType) -> MutTypedStatsSetRef<'a, 'b> {
62        MutTypedStatsSetRef {
63            values: self,
64            dtype,
65        }
66    }
67
68    /// Wrap stats set with a dtype for typed scalar access
69    pub fn as_typed_ref<'a, 'b>(&'a self, dtype: &'b DType) -> TypedStatsSetRef<'a, 'b> {
70        TypedStatsSetRef {
71            values: self,
72            dtype,
73        }
74    }
75}
76
77// Getters and setters for individual stats.
78impl StatsSet {
79    /// Set the stat `stat` to `value`.
80    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
81        if let Some(existing) = self.values.iter_mut().find(|(s, _)| *s == stat) {
82            *existing = (stat, value);
83        } else {
84            self.values.push((stat, value));
85        }
86    }
87
88    /// Clear the stat `stat` from the set.
89    pub fn clear(&mut self, stat: Stat) {
90        self.values.retain(|(s, _)| *s != stat);
91    }
92
93    /// Only keep given stats
94    pub fn retain_only(&mut self, stats: &[Stat]) {
95        self.values.retain(|(s, _)| stats.contains(s));
96    }
97
98    /// Iterate over the statistic names and values in-place.
99    ///
100    /// See [Iterator].
101    pub fn iter(&self) -> impl Iterator<Item = &(Stat, Precision<ScalarValue>)> {
102        self.values.iter()
103    }
104
105    /// Get value for a given stat
106    pub fn get(&self, stat: Stat) -> Precision<ScalarValue> {
107        self.values
108            .iter()
109            .find(|(s, _)| *s == stat)
110            .map(|(_, v)| v.clone())
111            .unwrap_or(Precision::Absent)
112    }
113
114    /// Length of the stats set
115    pub fn len(&self) -> usize {
116        self.values.len()
117    }
118
119    /// Check whether the statset is empty
120    pub fn is_empty(&self) -> bool {
121        self.values.is_empty()
122    }
123
124    /// Get scalar value of a given dtype
125    pub fn get_as<T: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
126        &self,
127        stat: Stat,
128        dtype: &DType,
129    ) -> Precision<T> {
130        self.get(stat).map(|v| {
131            T::try_from(
132                &Scalar::try_new(dtype.clone(), Some(v))
133                    .vortex_expect("failed to construct a scalar statistic"),
134            )
135            .unwrap_or_else(|err| {
136                vortex_panic!(
137                    err,
138                    "Failed to get stat {} as {}",
139                    stat,
140                    std::any::type_name::<T>()
141                )
142            })
143        })
144    }
145}
146
147// StatSetIntoIter just exists to protect current implementation from exposure on the public API.
148
149/// Owned iterator over the stats.
150///
151/// See [IntoIterator].
152pub struct StatsSetIntoIter(smallvec::IntoIter<StatsArray>);
153
154impl Iterator for StatsSetIntoIter {
155    type Item = (Stat, Precision<ScalarValue>);
156
157    fn next(&mut self) -> Option<Self::Item> {
158        self.0.next()
159    }
160}
161
162impl IntoIterator for StatsSet {
163    type Item = (Stat, Precision<ScalarValue>);
164    type IntoIter = StatsSetIntoIter;
165
166    fn into_iter(self) -> Self::IntoIter {
167        StatsSetIntoIter(self.values.into_iter())
168    }
169}
170
171impl FromIterator<(Stat, Precision<ScalarValue>)> for StatsSet {
172    fn from_iter<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(iter: T) -> Self {
173        let iter = iter.into_iter();
174
175        let mut this = Self {
176            values: SmallVec::new(),
177        };
178        this.extend(iter);
179        this
180    }
181}
182
183impl Extend<(Stat, Precision<ScalarValue>)> for StatsSet {
184    #[inline]
185    fn extend<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(&mut self, iter: T) {
186        iter.into_iter()
187            .for_each(|(stat, value)| self.set(stat, value));
188    }
189}
190
191/// Merge helpers
192impl StatsSet {
193    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
194    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
195    pub fn merge_ordered(mut self, other: &Self, dtype: &DType) -> Self {
196        self.as_mut_typed_ref(dtype)
197            .merge_ordered(&other.as_typed_ref(dtype));
198        self
199    }
200
201    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
202    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
203    pub fn merge_unordered(mut self, other: &Self, dtype: &DType) -> Self {
204        self.as_mut_typed_ref(dtype)
205            .merge_unordered(&other.as_typed_ref(dtype));
206        self
207    }
208
209    /// Given two sets of stats (of differing precision) for the same array, combine them
210    pub fn combine_sets(&mut self, other: &Self, dtype: &DType) -> VortexResult<()> {
211        self.as_mut_typed_ref(dtype)
212            .combine_sets(&other.as_typed_ref(dtype))
213    }
214}
215
216pub struct TypedStatsSetRef<'a, 'b> {
217    pub values: &'a StatsSet,
218    pub dtype: &'b DType,
219}
220
221impl StatsProvider for TypedStatsSetRef<'_, '_> {
222    fn get(&self, stat: Stat) -> Precision<Scalar> {
223        self.values.get(stat).map(|sv| {
224            Scalar::try_new(
225                stat.dtype(self.dtype)
226                    .vortex_expect("Must have valid dtype if value is present"),
227                Some(sv),
228            )
229            .vortex_expect("failed to construct a scalar statistic")
230        })
231    }
232
233    fn len(&self) -> usize {
234        self.values.len()
235    }
236}
237
238pub struct MutTypedStatsSetRef<'a, 'b> {
239    pub values: &'a mut StatsSet,
240    pub dtype: &'b DType,
241}
242
243impl MutTypedStatsSetRef<'_, '_> {
244    /// Set the stat `stat` to `value`.
245    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
246        self.values.set(stat, value);
247    }
248
249    /// Clear the stat `stat` from the set.
250    pub fn clear(&mut self, stat: Stat) {
251        self.values.clear(stat);
252    }
253}
254
255impl StatsProvider for MutTypedStatsSetRef<'_, '_> {
256    fn get(&self, stat: Stat) -> Precision<Scalar> {
257        self.values.get(stat).map(|sv| {
258            Scalar::try_new(
259                stat.dtype(self.dtype)
260                    .vortex_expect("Must have valid dtype if value is present"),
261                Some(sv),
262            )
263            .vortex_expect("failed to construct a scalar statistic")
264        })
265    }
266
267    fn len(&self) -> usize {
268        self.values.len()
269    }
270}
271
272// Merge helpers
273impl MutTypedStatsSetRef<'_, '_> {
274    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
275    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
276    pub fn merge_ordered(mut self, other: &TypedStatsSetRef) -> Self {
277        for s in all::<Stat>() {
278            match s {
279                Stat::IsConstant => self.merge_is_constant(other),
280                Stat::IsSorted => self.merge_is_sorted(other),
281                Stat::IsStrictSorted => self.merge_is_strict_sorted(other),
282                Stat::Max => self.merge_max(other),
283                Stat::Min => self.merge_min(other),
284                Stat::Sum => self.merge_sum(other),
285                Stat::NullCount => self.merge_null_count(other),
286                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
287                Stat::NaNCount => self.merge_nan_count(other),
288            }
289        }
290
291        self
292    }
293
294    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
295    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
296    pub fn merge_unordered(mut self, other: &TypedStatsSetRef) -> Self {
297        for s in all::<Stat>() {
298            if !s.is_commutative() {
299                self.clear(s);
300                continue;
301            }
302
303            match s {
304                Stat::IsConstant => self.merge_is_constant(other),
305                Stat::Max => self.merge_max(other),
306                Stat::Min => self.merge_min(other),
307                Stat::Sum => self.merge_sum(other),
308                Stat::NullCount => self.merge_null_count(other),
309                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
310                Stat::IsSorted | Stat::IsStrictSorted => {
311                    unreachable!("not commutative")
312                }
313                Stat::NaNCount => self.merge_nan_count(other),
314            }
315        }
316
317        self
318    }
319
320    /// Given two sets of stats (of differing precision) for the same array, combine them
321    pub fn combine_sets(&mut self, other: &TypedStatsSetRef) -> VortexResult<()> {
322        let other_stats: Vec<_> = other.values.iter().map(|(stat, _)| *stat).collect();
323        for s in other_stats {
324            match s {
325                Stat::Max => self.combine_bound::<Max>(other)?,
326                Stat::Min => self.combine_bound::<Min>(other)?,
327                Stat::UncompressedSizeInBytes => {
328                    self.combine_bound::<UncompressedSizeInBytes>(other)?
329                }
330                Stat::IsConstant => self.combine_bool_stat::<IsConstant>(other)?,
331                Stat::IsSorted => self.combine_bool_stat::<IsSorted>(other)?,
332                Stat::IsStrictSorted => self.combine_bool_stat::<IsStrictSorted>(other)?,
333                Stat::NullCount => self.combine_bound::<NullCount>(other)?,
334                Stat::Sum => self.combine_bound::<Sum>(other)?,
335                Stat::NaNCount => self.combine_bound::<NaNCount>(other)?,
336            }
337        }
338        Ok(())
339    }
340
341    fn combine_bound<S: StatType<Scalar>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
342    where
343        S::Bound: StatBound<Scalar> + Debug + Eq + PartialEq,
344    {
345        match (self.get_scalar_bound::<S>(), other.get_scalar_bound::<S>()) {
346            (Some(m1), Some(m2)) => {
347                let meet = m1
348                    .intersection(&m2)
349                    .vortex_expect("can always compare scalar")
350                    .ok_or_else(|| {
351                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
352                    })?;
353                if meet != m1 {
354                    self.set(
355                        S::STAT,
356                        meet.into_value().map(|s| {
357                            s.into_value()
358                                .vortex_expect("stat scalar value cannot be null")
359                        }),
360                    );
361                }
362            }
363            (None, Some(m)) => self.set(
364                S::STAT,
365                m.into_value().map(|s| {
366                    s.into_value()
367                        .vortex_expect("stat scalar value cannot be null")
368                }),
369            ),
370            (Some(_), _) => (),
371            (None, None) => self.clear(S::STAT),
372        }
373        Ok(())
374    }
375
376    fn combine_bool_stat<S: StatType<bool>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
377    where
378        S::Bound: StatBound<bool> + Debug + Eq + PartialEq,
379    {
380        match (
381            self.get_as_bound::<S, bool>(),
382            other.get_as_bound::<S, bool>(),
383        ) {
384            (Some(m1), Some(m2)) => {
385                let intersection = m1
386                    .intersection(&m2)
387                    .vortex_expect("can always compare boolean")
388                    .ok_or_else(|| {
389                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
390                    })?;
391                if intersection != m1 {
392                    self.set(S::STAT, intersection.into_value().map(ScalarValue::from));
393                }
394            }
395            (None, Some(m)) => self.set(S::STAT, m.into_value().map(ScalarValue::from)),
396            (Some(_), None) => (),
397            (None, None) => self.clear(S::STAT),
398        }
399        Ok(())
400    }
401
402    fn merge_min(&mut self, other: &TypedStatsSetRef) {
403        match (
404            self.get_scalar_bound::<Min>(),
405            other.get_scalar_bound::<Min>(),
406        ) {
407            (Some(m1), Some(m2)) => {
408                let meet = m1.union(&m2).vortex_expect("can compare scalar");
409                if meet != m1 {
410                    self.set(
411                        Stat::Min,
412                        meet.into_value().map(|s| {
413                            s.into_value()
414                                .vortex_expect("stat scalar value cannot be null")
415                        }),
416                    );
417                }
418            }
419            _ => self.clear(Stat::Min),
420        }
421    }
422
423    fn merge_max(&mut self, other: &TypedStatsSetRef) {
424        match (
425            self.get_scalar_bound::<Max>(),
426            other.get_scalar_bound::<Max>(),
427        ) {
428            (Some(m1), Some(m2)) => {
429                let meet = m1.union(&m2).vortex_expect("can compare scalar");
430                if meet != m1 {
431                    self.set(
432                        Stat::Max,
433                        meet.into_value().map(|s| {
434                            s.into_value()
435                                .vortex_expect("stat scalar value cannot be null")
436                        }),
437                    );
438                }
439            }
440            _ => self.clear(Stat::Max),
441        }
442    }
443
444    fn merge_sum(&mut self, other: &TypedStatsSetRef) {
445        match (
446            self.get_scalar_bound::<Sum>(),
447            other.get_scalar_bound::<Sum>(),
448        ) {
449            (Some(m1), Some(m2)) => {
450                // If the combine sum is exact, then we can sum them.
451                if let Some(scalar_value) =
452                    m1.zip(m2).as_exact().and_then(|(s1, s2)| match s1.dtype() {
453                        DType::Primitive(..) => s1
454                            .as_primitive()
455                            .checked_add(&s2.as_primitive())
456                            .and_then(|pscalar| pscalar.pvalue().map(ScalarValue::Primitive)),
457                        DType::Decimal(..) => s1
458                            .as_decimal()
459                            .checked_binary_numeric(
460                                &s2.as_decimal(),
461                                crate::scalar::NumericOperator::Add,
462                            )
463                            .map(|scalar| {
464                                ScalarValue::Decimal(
465                                    scalar
466                                        .decimal_value()
467                                        .vortex_expect("no decimal value in scalar"),
468                                )
469                            }),
470                        _ => None,
471                    })
472                {
473                    self.set(Stat::Sum, Precision::Exact(scalar_value));
474                }
475            }
476            _ => self.clear(Stat::Sum),
477        }
478    }
479
480    fn merge_is_constant(&mut self, other: &TypedStatsSetRef) {
481        let self_const = self.get_as(Stat::IsConstant);
482        let other_const = other.get_as(Stat::IsConstant);
483        let self_min = self.get(Stat::Min);
484        let other_min = other.get(Stat::Min);
485
486        if let (Some(self_const), Some(other_const), Some(self_min), Some(other_min)) = (
487            self_const.as_exact(),
488            other_const.as_exact(),
489            self_min.as_exact(),
490            other_min.as_exact(),
491        ) {
492            if self_const && other_const && self_min == other_min {
493                self.set(Stat::IsConstant, Precision::exact(true));
494            } else {
495                self.set(Stat::IsConstant, Precision::inexact(false));
496            }
497        }
498        self.set(Stat::IsConstant, Precision::exact(false));
499    }
500
501    fn merge_is_sorted(&mut self, other: &TypedStatsSetRef) {
502        self.merge_sortedness_stat(other, Stat::IsSorted, PartialOrd::le)
503    }
504
505    fn merge_is_strict_sorted(&mut self, other: &TypedStatsSetRef) {
506        self.merge_sortedness_stat(other, Stat::IsStrictSorted, PartialOrd::lt)
507    }
508
509    fn merge_sortedness_stat<F: Fn(&Scalar, &Scalar) -> bool>(
510        &mut self,
511        other: &TypedStatsSetRef,
512        stat: Stat,
513        cmp: F,
514    ) {
515        if (Precision::Exact(true), Precision::Exact(true))
516            == (self.get_as(stat), other.get_as(stat))
517        {
518            // There might be no stat because it was dropped, or it doesn't exist
519            // (e.g. an all null array).
520            // We assume that it was the dropped case since the doesn't exist might imply sorted,
521            // but this in-precision is correct.
522            if let (Some(self_max), Some(other_min)) = (
523                self.get_scalar_bound::<Max>().and_then(|v| v.max_value()),
524                other.get_scalar_bound::<Min>().and_then(|v| v.min_value()),
525            ) {
526                return if cmp(&self_max, &other_min) {
527                    // keep value
528                } else {
529                    self.set(stat, Precision::inexact(false));
530                };
531            }
532        }
533        self.clear(stat);
534    }
535
536    fn merge_null_count(&mut self, other: &TypedStatsSetRef) {
537        self.merge_sum_stat(Stat::NullCount, other)
538    }
539
540    fn merge_nan_count(&mut self, other: &TypedStatsSetRef) {
541        self.merge_sum_stat(Stat::NaNCount, other)
542    }
543
544    fn merge_uncompressed_size_in_bytes(&mut self, other: &TypedStatsSetRef) {
545        self.merge_sum_stat(Stat::UncompressedSizeInBytes, other)
546    }
547
548    fn merge_sum_stat(&mut self, stat: Stat, other: &TypedStatsSetRef) {
549        let merged = self
550            .get_as::<usize>(stat)
551            .zip(other.get_as::<usize>(stat))
552            .map(|(l, r)| ScalarValue::from(l + r));
553
554        if merged.is_absent() {
555            self.clear(stat);
556        } else {
557            self.set(stat, merged);
558        }
559    }
560}
561
562#[cfg(test)]
563mod test {
564    use enum_iterator::all;
565    use itertools::Itertools;
566    use smallvec::smallvec;
567
568    use crate::LEGACY_SESSION;
569    use crate::VortexSessionExecute;
570    use crate::arrays::PrimitiveArray;
571    use crate::dtype::DType;
572    use crate::dtype::Nullability;
573    use crate::dtype::PType;
574    use crate::expr::stats::IsConstant;
575    use crate::expr::stats::Precision;
576    use crate::expr::stats::Stat;
577    use crate::expr::stats::StatsProvider;
578    use crate::expr::stats::StatsProviderExt;
579    use crate::stats::StatsSet;
580    use crate::stats::stats_set::Scalar;
581
582    #[test]
583    fn test_iter() {
584        // SAFETY: No duplicate stats.
585        let set = unsafe {
586            StatsSet::new_unchecked(smallvec![
587                (Stat::Max, Precision::exact(100)),
588                (Stat::Min, Precision::exact(42)),
589            ])
590        };
591        let mut iter = set.iter();
592        let first = iter.next().unwrap().clone();
593        assert_eq!(first.0, Stat::Max);
594        assert_eq!(
595            first.1.map(
596                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
597            ),
598            Precision::exact(100)
599        );
600        let snd = iter.next().unwrap().clone();
601        assert_eq!(snd.0, Stat::Min);
602        assert_eq!(
603            snd.1.map(
604                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
605            ),
606            Precision::exact(42)
607        );
608    }
609
610    #[test]
611    fn into_iter() {
612        // SAFETY: No duplicate stats.
613        let mut set = unsafe {
614            StatsSet::new_unchecked(smallvec![
615                (Stat::Max, Precision::exact(100)),
616                (Stat::Min, Precision::exact(42)),
617            ])
618        }
619        .into_iter();
620        let (stat, first) = set.next().unwrap();
621        assert_eq!(stat, Stat::Max);
622        assert_eq!(
623            first.map(
624                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
625            ),
626            Precision::exact(100)
627        );
628        let snd = set.next().unwrap();
629        assert_eq!(snd.0, Stat::Min);
630        assert_eq!(
631            snd.1.map(
632                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
633            ),
634            Precision::exact(42)
635        );
636    }
637
638    #[test]
639    fn merge_constant() {
640        let first = StatsSet::from_iter([
641            (Stat::Min, Precision::exact(42)),
642            (Stat::IsConstant, Precision::exact(true)),
643        ])
644        .merge_ordered(
645            &StatsSet::from_iter([
646                (Stat::Min, Precision::inexact(42)),
647                (Stat::IsConstant, Precision::exact(true)),
648            ]),
649            &DType::Primitive(PType::I32, Nullability::NonNullable),
650        );
651
652        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
653        assert_eq!(
654            first_ref.get_as::<bool>(Stat::IsConstant),
655            Precision::exact(false)
656        );
657        assert_eq!(first_ref.get_as::<i32>(Stat::Min), Precision::exact(42));
658    }
659
660    #[test]
661    fn merge_into_min() {
662        let first = StatsSet::of(Stat::Min, Precision::exact(42)).merge_ordered(
663            &StatsSet::default(),
664            &DType::Primitive(PType::I32, Nullability::NonNullable),
665        );
666
667        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
668        assert!(first_ref.get(Stat::Min).is_absent());
669    }
670
671    #[test]
672    fn merge_from_min() {
673        let first = StatsSet::default().merge_ordered(
674            &StatsSet::of(Stat::Min, Precision::exact(42)),
675            &DType::Primitive(PType::I32, Nullability::NonNullable),
676        );
677
678        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
679        assert!(first_ref.get(Stat::Min).is_absent());
680    }
681
682    #[test]
683    fn merge_mins() {
684        let first = StatsSet::of(Stat::Min, Precision::exact(37)).merge_ordered(
685            &StatsSet::of(Stat::Min, Precision::exact(42)),
686            &DType::Primitive(PType::I32, Nullability::NonNullable),
687        );
688
689        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
690        assert_eq!(first_ref.get_as::<i32>(Stat::Min), Precision::exact(37));
691    }
692
693    #[test]
694    fn merge_into_bound_max() {
695        let first = StatsSet::of(Stat::Max, Precision::exact(42)).merge_ordered(
696            &StatsSet::default(),
697            &DType::Primitive(PType::I32, Nullability::NonNullable),
698        );
699        assert!(first.get(Stat::Max).is_absent());
700    }
701
702    #[test]
703    fn merge_from_max() {
704        let first = StatsSet::default().merge_ordered(
705            &StatsSet::of(Stat::Max, Precision::exact(42)),
706            &DType::Primitive(PType::I32, Nullability::NonNullable),
707        );
708        assert!(first.get(Stat::Max).is_absent());
709    }
710
711    #[test]
712    fn merge_maxes() {
713        let first = StatsSet::of(Stat::Max, Precision::exact(37)).merge_ordered(
714            &StatsSet::of(Stat::Max, Precision::exact(42)),
715            &DType::Primitive(PType::I32, Nullability::NonNullable),
716        );
717        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
718        assert_eq!(first_ref.get_as::<i32>(Stat::Max), Precision::exact(42));
719    }
720
721    #[test]
722    fn merge_maxes_bound() {
723        let dtype = DType::Primitive(PType::I32, Nullability::NonNullable);
724        let first = StatsSet::of(Stat::Max, Precision::exact(42i32))
725            .merge_ordered(&StatsSet::of(Stat::Max, Precision::inexact(43i32)), &dtype);
726        let first_ref = first.as_typed_ref(&dtype);
727        assert_eq!(first_ref.get_as::<i32>(Stat::Max), Precision::inexact(43));
728    }
729
730    #[test]
731    fn merge_into_scalar() {
732        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
733        // for unsigned, f64 for floats).
734        let first = StatsSet::of(Stat::Sum, Precision::exact(42i64)).merge_ordered(
735            &StatsSet::default(),
736            &DType::Primitive(PType::I32, Nullability::NonNullable),
737        );
738        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
739        assert!(first_ref.get(Stat::Sum).is_absent());
740    }
741
742    #[test]
743    fn merge_from_scalar() {
744        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
745        // for unsigned, f64 for floats).
746        let first = StatsSet::default().merge_ordered(
747            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
748            &DType::Primitive(PType::I32, Nullability::NonNullable),
749        );
750        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
751        assert!(first_ref.get(Stat::Sum).is_absent());
752    }
753
754    #[test]
755    fn merge_scalars() {
756        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
757        // for unsigned, f64 for floats).
758        let first = StatsSet::of(Stat::Sum, Precision::exact(37i64)).merge_ordered(
759            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
760            &DType::Primitive(PType::I32, Nullability::NonNullable),
761        );
762        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
763        assert_eq!(first_ref.get_as::<i64>(Stat::Sum), Precision::exact(79i64));
764    }
765
766    #[test]
767    fn merge_into_sortedness() {
768        let first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)).merge_ordered(
769            &StatsSet::default(),
770            &DType::Primitive(PType::I32, Nullability::NonNullable),
771        );
772        assert!(first.get(Stat::IsStrictSorted).is_absent());
773    }
774
775    #[test]
776    fn merge_from_sortedness() {
777        let first = StatsSet::default().merge_ordered(
778            &StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)),
779            &DType::Primitive(PType::I32, Nullability::NonNullable),
780        );
781        assert!(first.get(Stat::IsStrictSorted).is_absent());
782    }
783
784    #[test]
785    fn merge_sortedness() {
786        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
787        first.set(Stat::Max, Precision::exact(1));
788        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
789        second.set(Stat::Min, Precision::exact(2));
790        first = first.merge_ordered(
791            &second,
792            &DType::Primitive(PType::I32, Nullability::NonNullable),
793        );
794
795        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
796        assert_eq!(
797            first_ref.get_as::<bool>(Stat::IsStrictSorted),
798            Precision::exact(true)
799        );
800    }
801
802    #[test]
803    fn merge_sortedness_out_of_order() {
804        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
805        first.set(Stat::Min, Precision::exact(1));
806        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
807        second.set(Stat::Max, Precision::exact(2));
808        second = second.merge_ordered(
809            &first,
810            &DType::Primitive(PType::I32, Nullability::NonNullable),
811        );
812
813        let second_ref =
814            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
815        assert_eq!(
816            second_ref.get_as::<bool>(Stat::IsStrictSorted),
817            Precision::inexact(false)
818        );
819    }
820
821    #[test]
822    fn merge_sortedness_only_one_sorted() {
823        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
824        first.set(Stat::Max, Precision::exact(1));
825        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(false));
826        second.set(Stat::Min, Precision::exact(2));
827        first.merge_ordered(
828            &second,
829            &DType::Primitive(PType::I32, Nullability::NonNullable),
830        );
831
832        let second_ref =
833            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
834        assert_eq!(
835            second_ref.get_as::<bool>(Stat::IsStrictSorted),
836            Precision::exact(false)
837        );
838    }
839
840    #[test]
841    fn merge_sortedness_missing_min() {
842        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
843        first.set(Stat::Max, Precision::exact(1));
844        let second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
845        first = first.merge_ordered(
846            &second,
847            &DType::Primitive(PType::I32, Nullability::NonNullable),
848        );
849        assert!(first.get(Stat::IsStrictSorted).is_absent());
850    }
851
852    #[test]
853    fn merge_sortedness_bound_min() {
854        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
855        first.set(Stat::Max, Precision::exact(1));
856        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
857        second.set(Stat::Min, Precision::inexact(2));
858        first = first.merge_ordered(
859            &second,
860            &DType::Primitive(PType::I32, Nullability::NonNullable),
861        );
862
863        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
864        assert_eq!(
865            first_ref.get_as::<bool>(Stat::IsStrictSorted),
866            Precision::exact(true)
867        );
868    }
869
870    #[test]
871    fn merge_unordered() {
872        let array =
873            PrimitiveArray::from_option_iter([Some(1), None, Some(2), Some(42), Some(10000), None]);
874        let all_stats = all::<Stat>()
875            .filter(|s| !matches!(s, Stat::Sum))
876            .filter(|s| !matches!(s, Stat::NaNCount))
877            .collect_vec();
878        array
879            .statistics()
880            .compute_all(&all_stats, &mut LEGACY_SESSION.create_execution_ctx())
881            .unwrap();
882
883        let stats = array.statistics().to_owned();
884        for stat in &all_stats {
885            assert!(!stats.get(*stat).is_absent(), "Stat {stat} is missing");
886        }
887
888        let merged = stats.clone().merge_unordered(
889            &stats,
890            &DType::Primitive(PType::I32, Nullability::NonNullable),
891        );
892        for stat in &all_stats {
893            assert_eq!(
894                !merged.get(*stat).is_absent(),
895                stat.is_commutative(),
896                "Stat {stat} remains after merge_unordered despite not being commutative, or was removed despite being commutative"
897            )
898        }
899
900        let merged_ref = merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
901        let stats_ref = stats.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
902
903        assert_eq!(
904            merged_ref.get_as::<i32>(Stat::Min),
905            stats_ref.get_as::<i32>(Stat::Min)
906        );
907        assert_eq!(
908            merged_ref.get_as::<i32>(Stat::Max),
909            stats_ref.get_as::<i32>(Stat::Max)
910        );
911        assert_eq!(
912            merged_ref.get_as::<u64>(Stat::NullCount),
913            stats_ref.get_as::<u64>(Stat::NullCount).map(|s| s * 2)
914        );
915    }
916
917    #[test]
918    fn merge_min_bound_same() {
919        // Merging a stat with a bound and another with an exact results in exact stat.
920        // since bound for min is a lower bound, it can in fact contain any value >= bound.
921        let merged = StatsSet::of(Stat::Min, Precision::inexact(5)).merge_ordered(
922            &StatsSet::of(Stat::Min, Precision::exact(5)),
923            &DType::Primitive(PType::I32, Nullability::NonNullable),
924        );
925        let merged_ref =
926            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
927        assert_eq!(merged_ref.get_as::<i32>(Stat::Min), Precision::exact(5));
928    }
929
930    #[test]
931    fn merge_min_bound_bound_lower() {
932        let merged = StatsSet::of(Stat::Min, Precision::inexact(4)).merge_ordered(
933            &StatsSet::of(Stat::Min, Precision::exact(5)),
934            &DType::Primitive(PType::I32, Nullability::NonNullable),
935        );
936        let merged_ref =
937            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
938        assert_eq!(merged_ref.get_as::<i32>(Stat::Min), Precision::inexact(4));
939    }
940
941    #[test]
942    fn test_combine_is_constant() {
943        {
944            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
945            let stats2 = StatsSet::of(Stat::IsConstant, Precision::exact(true));
946            let mut stats_ref =
947                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
948            stats_ref
949                .combine_bool_stat::<IsConstant>(
950                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
951                )
952                .unwrap();
953            assert_eq!(
954                stats_ref.get_as::<bool>(Stat::IsConstant),
955                Precision::exact(true)
956            );
957        }
958
959        {
960            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
961            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
962            let mut stats_ref =
963                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
964            stats_ref
965                .combine_bool_stat::<IsConstant>(
966                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
967                )
968                .unwrap();
969            assert_eq!(
970                stats_ref.get_as::<bool>(Stat::IsConstant),
971                Precision::exact(true)
972            );
973        }
974
975        {
976            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(false));
977            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
978            let mut stats_ref =
979                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
980            stats_ref
981                .combine_bool_stat::<IsConstant>(
982                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
983                )
984                .unwrap();
985            assert_eq!(
986                stats_ref.get_as::<bool>(Stat::IsConstant),
987                Precision::exact(false)
988            );
989        }
990    }
991
992    #[test]
993    fn test_combine_sets_boolean_conflict() {
994        let mut stats1 = StatsSet::from_iter([
995            (Stat::IsConstant, Precision::exact(true)),
996            (Stat::IsSorted, Precision::exact(true)),
997        ]);
998
999        let stats2 = StatsSet::from_iter([
1000            (Stat::IsConstant, Precision::exact(false)),
1001            (Stat::IsSorted, Precision::exact(true)),
1002        ]);
1003
1004        let result = stats1.combine_sets(
1005            &stats2,
1006            &DType::Primitive(PType::I32, Nullability::NonNullable),
1007        );
1008        assert!(result.is_err());
1009    }
1010
1011    #[test]
1012    fn test_combine_sets_with_missing_stats() {
1013        let mut stats1 = StatsSet::from_iter([
1014            (Stat::Min, Precision::exact(42)),
1015            (Stat::UncompressedSizeInBytes, Precision::exact(1000)),
1016        ]);
1017
1018        let stats2 = StatsSet::from_iter([
1019            (Stat::Max, Precision::exact(100)),
1020            (Stat::IsStrictSorted, Precision::exact(true)),
1021        ]);
1022
1023        stats1
1024            .combine_sets(
1025                &stats2,
1026                &DType::Primitive(PType::I32, Nullability::NonNullable),
1027            )
1028            .unwrap();
1029
1030        let stats_ref =
1031            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1032
1033        // Min should remain unchanged
1034        assert_eq!(stats_ref.get_as::<i32>(Stat::Min), Precision::exact(42));
1035        // Max should be added
1036        assert_eq!(stats_ref.get_as::<i32>(Stat::Max), Precision::exact(100));
1037        // IsStrictSorted should be added
1038        assert_eq!(
1039            stats_ref.get_as::<bool>(Stat::IsStrictSorted),
1040            Precision::exact(true)
1041        );
1042    }
1043
1044    #[test]
1045    fn test_combine_sets_with_inexact() {
1046        let mut stats1 = StatsSet::from_iter([
1047            (Stat::Min, Precision::exact(42)),
1048            (Stat::Max, Precision::inexact(100)),
1049            (Stat::IsConstant, Precision::exact(false)),
1050        ]);
1051
1052        let stats2 = StatsSet::from_iter([
1053            // Must ensure Min from stats2 is <= Min from stats1
1054            (Stat::Min, Precision::inexact(40)),
1055            (Stat::Max, Precision::exact(90)),
1056            (Stat::IsSorted, Precision::exact(true)),
1057        ]);
1058
1059        stats1
1060            .combine_sets(
1061                &stats2,
1062                &DType::Primitive(PType::I32, Nullability::NonNullable),
1063            )
1064            .unwrap();
1065
1066        let stats_ref =
1067            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1068
1069        // Min should remain unchanged since it's more restrictive than the inexact value
1070        assert_eq!(stats_ref.get_as::<i32>(Stat::Min), Precision::exact(42));
1071        // Check that max was updated with the exact value
1072        assert_eq!(stats_ref.get_as::<i32>(Stat::Max), Precision::exact(90));
1073        // Check that IsSorted was added
1074        assert_eq!(
1075            stats_ref.get_as::<bool>(Stat::IsSorted),
1076            Precision::exact(true)
1077        );
1078    }
1079}