Skip to main content

vortex_array/stats/
stats_set.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5
6use enum_iterator::Sequence;
7use enum_iterator::all;
8use num_traits::CheckedAdd;
9use vortex_error::VortexError;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_err;
13use vortex_error::vortex_panic;
14
15use crate::dtype::DType;
16use crate::expr::stats::IsConstant;
17use crate::expr::stats::IsSorted;
18use crate::expr::stats::IsStrictSorted;
19use crate::expr::stats::Max;
20use crate::expr::stats::Min;
21use crate::expr::stats::NaNCount;
22use crate::expr::stats::NullCount;
23use crate::expr::stats::Precision;
24use crate::expr::stats::Stat;
25use crate::expr::stats::StatBound;
26use crate::expr::stats::StatType;
27use crate::expr::stats::StatsProvider;
28use crate::expr::stats::StatsProviderExt;
29use crate::expr::stats::Sum;
30use crate::expr::stats::UncompressedSizeInBytes;
31use crate::scalar::Scalar;
32use crate::scalar::ScalarValue;
33
34#[derive(Default, Debug, Clone)]
35pub struct StatsSet {
36    values: Vec<(Stat, Precision<ScalarValue>)>,
37}
38
39impl StatsSet {
40    /// Create new StatSet without validating uniqueness of all the entries
41    ///
42    /// # Safety
43    ///
44    /// This method will not panic or trigger UB, but may lead to duplicate stats being stored.
45    pub unsafe fn new_unchecked(values: Vec<(Stat, Precision<ScalarValue>)>) -> Self {
46        Self { values }
47    }
48
49    /// Create StatsSet from single stat and value
50    pub fn of(stat: Stat, value: Precision<ScalarValue>) -> Self {
51        // SAFETY: No duplicate stats will be set here.
52        unsafe { Self::new_unchecked(vec![(stat, value)]) }
53    }
54
55    fn reserve_full_capacity(&mut self) {
56        if self.values.capacity() < Stat::CARDINALITY {
57            self.values
58                .reserve_exact(Stat::CARDINALITY - self.values.capacity());
59        }
60    }
61
62    /// Wrap stats set with a dtype for mutable typed scalar access
63    pub fn as_mut_typed_ref<'a, 'b>(&'a mut self, dtype: &'b DType) -> MutTypedStatsSetRef<'a, 'b> {
64        MutTypedStatsSetRef {
65            values: self,
66            dtype,
67        }
68    }
69
70    /// Wrap stats set with a dtype for typed scalar access
71    pub fn as_typed_ref<'a, 'b>(&'a self, dtype: &'b DType) -> TypedStatsSetRef<'a, 'b> {
72        TypedStatsSetRef {
73            values: self,
74            dtype,
75        }
76    }
77}
78
79// Getters and setters for individual stats.
80impl StatsSet {
81    /// Set the stat `stat` to `value`.
82    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
83        self.reserve_full_capacity();
84
85        if let Some(existing) = self.values.iter_mut().find(|(s, _)| *s == stat) {
86            *existing = (stat, value);
87        } else {
88            self.values.push((stat, value));
89        }
90    }
91
92    /// Clear the stat `stat` from the set.
93    pub fn clear(&mut self, stat: Stat) {
94        self.values.retain(|(s, _)| *s != stat);
95    }
96
97    /// Only keep given stats
98    pub fn retain_only(&mut self, stats: &[Stat]) {
99        self.values.retain(|(s, _)| stats.contains(s));
100    }
101
102    /// Iterate over the statistic names and values in-place.
103    ///
104    /// See [Iterator].
105    pub fn iter(&self) -> impl Iterator<Item = &(Stat, Precision<ScalarValue>)> {
106        self.values.iter()
107    }
108
109    /// Get value for a given stat
110    pub fn get(&self, stat: Stat) -> Option<Precision<ScalarValue>> {
111        self.values
112            .iter()
113            .find(|(s, _)| *s == stat)
114            .map(|(_, v)| v.clone())
115    }
116
117    /// Length of the stats set
118    pub fn len(&self) -> usize {
119        self.values.len()
120    }
121
122    /// Check whether the statset is empty
123    pub fn is_empty(&self) -> bool {
124        self.values.is_empty()
125    }
126
127    /// Get scalar value of a given dtype
128    pub fn get_as<T: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
129        &self,
130        stat: Stat,
131        dtype: &DType,
132    ) -> Option<Precision<T>> {
133        self.get(stat).map(|v| {
134            v.map(|v| {
135                T::try_from(
136                    &Scalar::try_new(dtype.clone(), Some(v))
137                        .vortex_expect("failed to construct a scalar statistic"),
138                )
139                .unwrap_or_else(|err| {
140                    vortex_panic!(
141                        err,
142                        "Failed to get stat {} as {}",
143                        stat,
144                        std::any::type_name::<T>()
145                    )
146                })
147            })
148        })
149    }
150}
151
152// StatSetIntoIter just exists to protect current implementation from exposure on the public API.
153
154/// Owned iterator over the stats.
155///
156/// See [IntoIterator].
157pub struct StatsSetIntoIter(std::vec::IntoIter<(Stat, Precision<ScalarValue>)>);
158
159impl Iterator for StatsSetIntoIter {
160    type Item = (Stat, Precision<ScalarValue>);
161
162    fn next(&mut self) -> Option<Self::Item> {
163        self.0.next()
164    }
165}
166
167impl IntoIterator for StatsSet {
168    type Item = (Stat, Precision<ScalarValue>);
169    type IntoIter = StatsSetIntoIter;
170
171    fn into_iter(self) -> Self::IntoIter {
172        StatsSetIntoIter(self.values.into_iter())
173    }
174}
175
176impl FromIterator<(Stat, Precision<ScalarValue>)> for StatsSet {
177    fn from_iter<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(iter: T) -> Self {
178        let iter = iter.into_iter();
179        let mut values = Vec::default();
180        values.reserve_exact(Stat::CARDINALITY);
181
182        let mut this = Self { values };
183        this.extend(iter);
184        this
185    }
186}
187
188impl Extend<(Stat, Precision<ScalarValue>)> for StatsSet {
189    #[inline]
190    fn extend<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(&mut self, iter: T) {
191        let iter = iter.into_iter();
192        self.reserve_full_capacity();
193
194        iter.for_each(|(stat, value)| self.set(stat, value));
195    }
196}
197
198/// Merge helpers
199impl StatsSet {
200    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
201    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
202    pub fn merge_ordered(mut self, other: &Self, dtype: &DType) -> Self {
203        self.as_mut_typed_ref(dtype)
204            .merge_ordered(&other.as_typed_ref(dtype));
205        self
206    }
207
208    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
209    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
210    pub fn merge_unordered(mut self, other: &Self, dtype: &DType) -> Self {
211        self.as_mut_typed_ref(dtype)
212            .merge_unordered(&other.as_typed_ref(dtype));
213        self
214    }
215
216    /// Given two sets of stats (of differing precision) for the same array, combine them
217    pub fn combine_sets(&mut self, other: &Self, dtype: &DType) -> VortexResult<()> {
218        self.as_mut_typed_ref(dtype)
219            .combine_sets(&other.as_typed_ref(dtype))
220    }
221}
222
223pub struct TypedStatsSetRef<'a, 'b> {
224    pub values: &'a StatsSet,
225    pub dtype: &'b DType,
226}
227
228impl StatsProvider for TypedStatsSetRef<'_, '_> {
229    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
230        self.values.get(stat).map(|p| {
231            p.map(|sv| {
232                Scalar::try_new(
233                    stat.dtype(self.dtype)
234                        .vortex_expect("Must have valid dtype if value is present"),
235                    Some(sv),
236                )
237                .vortex_expect("failed to construct a scalar statistic")
238            })
239        })
240    }
241
242    fn len(&self) -> usize {
243        self.values.len()
244    }
245}
246
247pub struct MutTypedStatsSetRef<'a, 'b> {
248    pub values: &'a mut StatsSet,
249    pub dtype: &'b DType,
250}
251
252impl MutTypedStatsSetRef<'_, '_> {
253    /// Set the stat `stat` to `value`.
254    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
255        self.values.set(stat, value);
256    }
257
258    /// Clear the stat `stat` from the set.
259    pub fn clear(&mut self, stat: Stat) {
260        self.values.clear(stat);
261    }
262}
263
264impl StatsProvider for MutTypedStatsSetRef<'_, '_> {
265    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
266        self.values.get(stat).map(|p| {
267            p.map(|sv| {
268                Scalar::try_new(
269                    stat.dtype(self.dtype)
270                        .vortex_expect("Must have valid dtype if value is present"),
271                    Some(sv),
272                )
273                .vortex_expect("failed to construct a scalar statistic")
274            })
275        })
276    }
277
278    fn len(&self) -> usize {
279        self.values.len()
280    }
281}
282
283// Merge helpers
284impl MutTypedStatsSetRef<'_, '_> {
285    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
286    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
287    pub fn merge_ordered(mut self, other: &TypedStatsSetRef) -> Self {
288        for s in all::<Stat>() {
289            match s {
290                Stat::IsConstant => self.merge_is_constant(other),
291                Stat::IsSorted => self.merge_is_sorted(other),
292                Stat::IsStrictSorted => self.merge_is_strict_sorted(other),
293                Stat::Max => self.merge_max(other),
294                Stat::Min => self.merge_min(other),
295                Stat::Sum => self.merge_sum(other),
296                Stat::NullCount => self.merge_null_count(other),
297                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
298                Stat::NaNCount => self.merge_nan_count(other),
299            }
300        }
301
302        self
303    }
304
305    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
306    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
307    pub fn merge_unordered(mut self, other: &TypedStatsSetRef) -> Self {
308        for s in all::<Stat>() {
309            if !s.is_commutative() {
310                self.clear(s);
311                continue;
312            }
313
314            match s {
315                Stat::IsConstant => self.merge_is_constant(other),
316                Stat::Max => self.merge_max(other),
317                Stat::Min => self.merge_min(other),
318                Stat::Sum => self.merge_sum(other),
319                Stat::NullCount => self.merge_null_count(other),
320                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
321                Stat::IsSorted | Stat::IsStrictSorted => {
322                    unreachable!("not commutative")
323                }
324                Stat::NaNCount => self.merge_nan_count(other),
325            }
326        }
327
328        self
329    }
330
331    /// Given two sets of stats (of differing precision) for the same array, combine them
332    pub fn combine_sets(&mut self, other: &TypedStatsSetRef) -> VortexResult<()> {
333        let other_stats: Vec<_> = other.values.iter().map(|(stat, _)| *stat).collect();
334        for s in other_stats {
335            match s {
336                Stat::Max => self.combine_bound::<Max>(other)?,
337                Stat::Min => self.combine_bound::<Min>(other)?,
338                Stat::UncompressedSizeInBytes => {
339                    self.combine_bound::<UncompressedSizeInBytes>(other)?
340                }
341                Stat::IsConstant => self.combine_bool_stat::<IsConstant>(other)?,
342                Stat::IsSorted => self.combine_bool_stat::<IsSorted>(other)?,
343                Stat::IsStrictSorted => self.combine_bool_stat::<IsStrictSorted>(other)?,
344                Stat::NullCount => self.combine_bound::<NullCount>(other)?,
345                Stat::Sum => self.combine_bound::<Sum>(other)?,
346                Stat::NaNCount => self.combine_bound::<NaNCount>(other)?,
347            }
348        }
349        Ok(())
350    }
351
352    fn combine_bound<S: StatType<Scalar>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
353    where
354        S::Bound: StatBound<Scalar> + Debug + Eq + PartialEq,
355    {
356        match (self.get_scalar_bound::<S>(), other.get_scalar_bound::<S>()) {
357            (Some(m1), Some(m2)) => {
358                let meet = m1
359                    .intersection(&m2)
360                    .vortex_expect("can always compare scalar")
361                    .ok_or_else(|| {
362                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
363                    })?;
364                if meet != m1 {
365                    self.set(
366                        S::STAT,
367                        meet.into_value().map(|s| {
368                            s.into_value()
369                                .vortex_expect("stat scalar value cannot be null")
370                        }),
371                    );
372                }
373            }
374            (None, Some(m)) => self.set(
375                S::STAT,
376                m.into_value().map(|s| {
377                    s.into_value()
378                        .vortex_expect("stat scalar value cannot be null")
379                }),
380            ),
381            (Some(_), _) => (),
382            (None, None) => self.clear(S::STAT),
383        }
384        Ok(())
385    }
386
387    fn combine_bool_stat<S: StatType<bool>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
388    where
389        S::Bound: StatBound<bool> + Debug + Eq + PartialEq,
390    {
391        match (
392            self.get_as_bound::<S, bool>(),
393            other.get_as_bound::<S, bool>(),
394        ) {
395            (Some(m1), Some(m2)) => {
396                let intersection = m1
397                    .intersection(&m2)
398                    .vortex_expect("can always compare boolean")
399                    .ok_or_else(|| {
400                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
401                    })?;
402                if intersection != m1 {
403                    self.set(S::STAT, intersection.into_value().map(ScalarValue::from));
404                }
405            }
406            (None, Some(m)) => self.set(S::STAT, m.into_value().map(ScalarValue::from)),
407            (Some(_), None) => (),
408            (None, None) => self.clear(S::STAT),
409        }
410        Ok(())
411    }
412
413    fn merge_min(&mut self, other: &TypedStatsSetRef) {
414        match (
415            self.get_scalar_bound::<Min>(),
416            other.get_scalar_bound::<Min>(),
417        ) {
418            (Some(m1), Some(m2)) => {
419                let meet = m1.union(&m2).vortex_expect("can compare scalar");
420                if meet != m1 {
421                    self.set(
422                        Stat::Min,
423                        meet.into_value().map(|s| {
424                            s.into_value()
425                                .vortex_expect("stat scalar value cannot be null")
426                        }),
427                    );
428                }
429            }
430            _ => self.clear(Stat::Min),
431        }
432    }
433
434    fn merge_max(&mut self, other: &TypedStatsSetRef) {
435        match (
436            self.get_scalar_bound::<Max>(),
437            other.get_scalar_bound::<Max>(),
438        ) {
439            (Some(m1), Some(m2)) => {
440                let meet = m1.union(&m2).vortex_expect("can compare scalar");
441                if meet != m1 {
442                    self.set(
443                        Stat::Max,
444                        meet.into_value().map(|s| {
445                            s.into_value()
446                                .vortex_expect("stat scalar value cannot be null")
447                        }),
448                    );
449                }
450            }
451            _ => self.clear(Stat::Max),
452        }
453    }
454
455    fn merge_sum(&mut self, other: &TypedStatsSetRef) {
456        match (
457            self.get_scalar_bound::<Sum>(),
458            other.get_scalar_bound::<Sum>(),
459        ) {
460            (Some(m1), Some(m2)) => {
461                // If the combine sum is exact, then we can sum them.
462                if let Some(scalar_value) =
463                    m1.zip(m2).as_exact().and_then(|(s1, s2)| match s1.dtype() {
464                        DType::Primitive(..) => s1
465                            .as_primitive()
466                            .checked_add(&s2.as_primitive())
467                            .and_then(|pscalar| pscalar.pvalue().map(ScalarValue::Primitive)),
468                        DType::Decimal(..) => s1
469                            .as_decimal()
470                            .checked_binary_numeric(
471                                &s2.as_decimal(),
472                                crate::scalar::NumericOperator::Add,
473                            )
474                            .map(|scalar| {
475                                ScalarValue::Decimal(
476                                    scalar
477                                        .decimal_value()
478                                        .vortex_expect("no decimal value in scalar"),
479                                )
480                            }),
481                        _ => None,
482                    })
483                {
484                    self.set(Stat::Sum, Precision::Exact(scalar_value));
485                }
486            }
487            _ => self.clear(Stat::Sum),
488        }
489    }
490
491    fn merge_is_constant(&mut self, other: &TypedStatsSetRef) {
492        let self_const = self.get_as(Stat::IsConstant);
493        let other_const = other.get_as(Stat::IsConstant);
494        let self_min = self.get(Stat::Min);
495        let other_min = other.get(Stat::Min);
496
497        if let (
498            Some(Precision::Exact(self_const)),
499            Some(Precision::Exact(other_const)),
500            Some(Precision::Exact(self_min)),
501            Some(Precision::Exact(other_min)),
502        ) = (self_const, other_const, self_min, other_min)
503        {
504            if self_const && other_const && self_min == other_min {
505                self.set(Stat::IsConstant, Precision::exact(true));
506            } else {
507                self.set(Stat::IsConstant, Precision::inexact(false));
508            }
509        }
510        self.set(Stat::IsConstant, Precision::exact(false));
511    }
512
513    fn merge_is_sorted(&mut self, other: &TypedStatsSetRef) {
514        self.merge_sortedness_stat(other, Stat::IsSorted, PartialOrd::le)
515    }
516
517    fn merge_is_strict_sorted(&mut self, other: &TypedStatsSetRef) {
518        self.merge_sortedness_stat(other, Stat::IsStrictSorted, PartialOrd::lt)
519    }
520
521    fn merge_sortedness_stat<F: Fn(&Scalar, &Scalar) -> bool>(
522        &mut self,
523        other: &TypedStatsSetRef,
524        stat: Stat,
525        cmp: F,
526    ) {
527        if (Some(Precision::Exact(true)), Some(Precision::Exact(true)))
528            == (self.get_as(stat), other.get_as(stat))
529        {
530            // There might be no stat because it was dropped, or it doesn't exist
531            // (e.g. an all null array).
532            // We assume that it was the dropped case since the doesn't exist might imply sorted,
533            // but this in-precision is correct.
534            if let (Some(self_max), Some(other_min)) = (
535                self.get_scalar_bound::<Max>(),
536                other.get_scalar_bound::<Min>(),
537            ) {
538                return if cmp(&self_max.max_value(), &other_min.min_value()) {
539                    // keep value
540                } else {
541                    self.set(stat, Precision::inexact(false));
542                };
543            }
544        }
545        self.clear(stat);
546    }
547
548    fn merge_null_count(&mut self, other: &TypedStatsSetRef) {
549        self.merge_sum_stat(Stat::NullCount, other)
550    }
551
552    fn merge_nan_count(&mut self, other: &TypedStatsSetRef) {
553        self.merge_sum_stat(Stat::NaNCount, other)
554    }
555
556    fn merge_uncompressed_size_in_bytes(&mut self, other: &TypedStatsSetRef) {
557        self.merge_sum_stat(Stat::UncompressedSizeInBytes, other)
558    }
559
560    fn merge_sum_stat(&mut self, stat: Stat, other: &TypedStatsSetRef) {
561        match (self.get_as::<usize>(stat), other.get_as::<usize>(stat)) {
562            (Some(nc1), Some(nc2)) => {
563                self.set(
564                    stat,
565                    nc1.zip(nc2).map(|(nc1, nc2)| ScalarValue::from(nc1 + nc2)),
566                );
567            }
568            _ => self.clear(stat),
569        }
570    }
571}
572
573#[cfg(test)]
574mod test {
575    use enum_iterator::all;
576    use itertools::Itertools;
577
578    use crate::LEGACY_SESSION;
579    use crate::VortexSessionExecute;
580    use crate::arrays::PrimitiveArray;
581    use crate::dtype::DType;
582    use crate::dtype::Nullability;
583    use crate::dtype::PType;
584    use crate::expr::stats::IsConstant;
585    use crate::expr::stats::Precision;
586    use crate::expr::stats::Stat;
587    use crate::expr::stats::StatsProvider;
588    use crate::expr::stats::StatsProviderExt;
589    use crate::stats::StatsSet;
590    use crate::stats::stats_set::Scalar;
591
592    #[test]
593    fn test_iter() {
594        // SAFETY: No duplicate stats.
595        let set = unsafe {
596            StatsSet::new_unchecked(vec![
597                (Stat::Max, Precision::exact(100)),
598                (Stat::Min, Precision::exact(42)),
599            ])
600        };
601        let mut iter = set.iter();
602        let first = iter.next().unwrap().clone();
603        assert_eq!(first.0, Stat::Max);
604        assert_eq!(
605            first.1.map(
606                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
607            ),
608            Precision::exact(100)
609        );
610        let snd = iter.next().unwrap().clone();
611        assert_eq!(snd.0, Stat::Min);
612        assert_eq!(
613            snd.1.map(
614                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
615            ),
616            Precision::exact(42)
617        );
618    }
619
620    #[test]
621    fn into_iter() {
622        // SAFETY: No duplicate stats.
623        let mut set = unsafe {
624            StatsSet::new_unchecked(vec![
625                (Stat::Max, Precision::exact(100)),
626                (Stat::Min, Precision::exact(42)),
627            ])
628        }
629        .into_iter();
630        let (stat, first) = set.next().unwrap();
631        assert_eq!(stat, Stat::Max);
632        assert_eq!(
633            first.map(
634                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
635            ),
636            Precision::exact(100)
637        );
638        let snd = set.next().unwrap();
639        assert_eq!(snd.0, Stat::Min);
640        assert_eq!(
641            snd.1.map(
642                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
643            ),
644            Precision::exact(42)
645        );
646    }
647
648    #[test]
649    fn merge_constant() {
650        let first = StatsSet::from_iter([
651            (Stat::Min, Precision::exact(42)),
652            (Stat::IsConstant, Precision::exact(true)),
653        ])
654        .merge_ordered(
655            &StatsSet::from_iter([
656                (Stat::Min, Precision::inexact(42)),
657                (Stat::IsConstant, Precision::exact(true)),
658            ]),
659            &DType::Primitive(PType::I32, Nullability::NonNullable),
660        );
661
662        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
663        assert_eq!(
664            first_ref.get_as::<bool>(Stat::IsConstant),
665            Some(Precision::exact(false))
666        );
667        assert_eq!(
668            first_ref.get_as::<i32>(Stat::Min),
669            Some(Precision::exact(42))
670        );
671    }
672
673    #[test]
674    fn merge_into_min() {
675        let first = StatsSet::of(Stat::Min, Precision::exact(42)).merge_ordered(
676            &StatsSet::default(),
677            &DType::Primitive(PType::I32, Nullability::NonNullable),
678        );
679
680        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
681        assert!(first_ref.get(Stat::Min).is_none());
682    }
683
684    #[test]
685    fn merge_from_min() {
686        let first = StatsSet::default().merge_ordered(
687            &StatsSet::of(Stat::Min, Precision::exact(42)),
688            &DType::Primitive(PType::I32, Nullability::NonNullable),
689        );
690
691        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
692        assert!(first_ref.get(Stat::Min).is_none());
693    }
694
695    #[test]
696    fn merge_mins() {
697        let first = StatsSet::of(Stat::Min, Precision::exact(37)).merge_ordered(
698            &StatsSet::of(Stat::Min, Precision::exact(42)),
699            &DType::Primitive(PType::I32, Nullability::NonNullable),
700        );
701
702        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
703        assert_eq!(
704            first_ref.get_as::<i32>(Stat::Min),
705            Some(Precision::exact(37))
706        );
707    }
708
709    #[test]
710    fn merge_into_bound_max() {
711        let first = StatsSet::of(Stat::Max, Precision::exact(42)).merge_ordered(
712            &StatsSet::default(),
713            &DType::Primitive(PType::I32, Nullability::NonNullable),
714        );
715        assert!(first.get(Stat::Max).is_none());
716    }
717
718    #[test]
719    fn merge_from_max() {
720        let first = StatsSet::default().merge_ordered(
721            &StatsSet::of(Stat::Max, Precision::exact(42)),
722            &DType::Primitive(PType::I32, Nullability::NonNullable),
723        );
724        assert!(first.get(Stat::Max).is_none());
725    }
726
727    #[test]
728    fn merge_maxes() {
729        let first = StatsSet::of(Stat::Max, Precision::exact(37)).merge_ordered(
730            &StatsSet::of(Stat::Max, Precision::exact(42)),
731            &DType::Primitive(PType::I32, Nullability::NonNullable),
732        );
733        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
734        assert_eq!(
735            first_ref.get_as::<i32>(Stat::Max),
736            Some(Precision::exact(42))
737        );
738    }
739
740    #[test]
741    fn merge_maxes_bound() {
742        let dtype = DType::Primitive(PType::I32, Nullability::NonNullable);
743        let first = StatsSet::of(Stat::Max, Precision::exact(42i32))
744            .merge_ordered(&StatsSet::of(Stat::Max, Precision::inexact(43i32)), &dtype);
745        let first_ref = first.as_typed_ref(&dtype);
746        assert_eq!(
747            first_ref.get_as::<i32>(Stat::Max),
748            Some(Precision::inexact(43))
749        );
750    }
751
752    #[test]
753    fn merge_into_scalar() {
754        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
755        // for unsigned, f64 for floats).
756        let first = StatsSet::of(Stat::Sum, Precision::exact(42i64)).merge_ordered(
757            &StatsSet::default(),
758            &DType::Primitive(PType::I32, Nullability::NonNullable),
759        );
760        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
761        assert!(first_ref.get(Stat::Sum).is_none());
762    }
763
764    #[test]
765    fn merge_from_scalar() {
766        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
767        // for unsigned, f64 for floats).
768        let first = StatsSet::default().merge_ordered(
769            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
770            &DType::Primitive(PType::I32, Nullability::NonNullable),
771        );
772        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
773        assert!(first_ref.get(Stat::Sum).is_none());
774    }
775
776    #[test]
777    fn merge_scalars() {
778        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
779        // for unsigned, f64 for floats).
780        let first = StatsSet::of(Stat::Sum, Precision::exact(37i64)).merge_ordered(
781            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
782            &DType::Primitive(PType::I32, Nullability::NonNullable),
783        );
784        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
785        assert_eq!(
786            first_ref.get_as::<i64>(Stat::Sum),
787            Some(Precision::exact(79i64))
788        );
789    }
790
791    #[test]
792    fn merge_into_sortedness() {
793        let first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)).merge_ordered(
794            &StatsSet::default(),
795            &DType::Primitive(PType::I32, Nullability::NonNullable),
796        );
797        assert!(first.get(Stat::IsStrictSorted).is_none());
798    }
799
800    #[test]
801    fn merge_from_sortedness() {
802        let first = StatsSet::default().merge_ordered(
803            &StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)),
804            &DType::Primitive(PType::I32, Nullability::NonNullable),
805        );
806        assert!(first.get(Stat::IsStrictSorted).is_none());
807    }
808
809    #[test]
810    fn merge_sortedness() {
811        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
812        first.set(Stat::Max, Precision::exact(1));
813        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
814        second.set(Stat::Min, Precision::exact(2));
815        first = first.merge_ordered(
816            &second,
817            &DType::Primitive(PType::I32, Nullability::NonNullable),
818        );
819
820        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
821        assert_eq!(
822            first_ref.get_as::<bool>(Stat::IsStrictSorted),
823            Some(Precision::exact(true))
824        );
825    }
826
827    #[test]
828    fn merge_sortedness_out_of_order() {
829        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
830        first.set(Stat::Min, Precision::exact(1));
831        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
832        second.set(Stat::Max, Precision::exact(2));
833        second = second.merge_ordered(
834            &first,
835            &DType::Primitive(PType::I32, Nullability::NonNullable),
836        );
837
838        let second_ref =
839            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
840        assert_eq!(
841            second_ref.get_as::<bool>(Stat::IsStrictSorted),
842            Some(Precision::inexact(false))
843        );
844    }
845
846    #[test]
847    fn merge_sortedness_only_one_sorted() {
848        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
849        first.set(Stat::Max, Precision::exact(1));
850        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(false));
851        second.set(Stat::Min, Precision::exact(2));
852        first.merge_ordered(
853            &second,
854            &DType::Primitive(PType::I32, Nullability::NonNullable),
855        );
856
857        let second_ref =
858            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
859        assert_eq!(
860            second_ref.get_as::<bool>(Stat::IsStrictSorted),
861            Some(Precision::exact(false))
862        );
863    }
864
865    #[test]
866    fn merge_sortedness_missing_min() {
867        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
868        first.set(Stat::Max, Precision::exact(1));
869        let second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
870        first = first.merge_ordered(
871            &second,
872            &DType::Primitive(PType::I32, Nullability::NonNullable),
873        );
874        assert!(first.get(Stat::IsStrictSorted).is_none());
875    }
876
877    #[test]
878    fn merge_sortedness_bound_min() {
879        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
880        first.set(Stat::Max, Precision::exact(1));
881        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
882        second.set(Stat::Min, Precision::inexact(2));
883        first = first.merge_ordered(
884            &second,
885            &DType::Primitive(PType::I32, Nullability::NonNullable),
886        );
887
888        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
889        assert_eq!(
890            first_ref.get_as::<bool>(Stat::IsStrictSorted),
891            Some(Precision::exact(true))
892        );
893    }
894
895    #[test]
896    fn merge_unordered() {
897        let array =
898            PrimitiveArray::from_option_iter([Some(1), None, Some(2), Some(42), Some(10000), None]);
899        let all_stats = all::<Stat>()
900            .filter(|s| !matches!(s, Stat::Sum))
901            .filter(|s| !matches!(s, Stat::NaNCount))
902            .collect_vec();
903        array
904            .statistics()
905            .compute_all(&all_stats, &mut LEGACY_SESSION.create_execution_ctx())
906            .unwrap();
907
908        let stats = array.statistics().to_owned();
909        for stat in &all_stats {
910            assert!(stats.get(*stat).is_some(), "Stat {stat} is missing");
911        }
912
913        let merged = stats.clone().merge_unordered(
914            &stats,
915            &DType::Primitive(PType::I32, Nullability::NonNullable),
916        );
917        for stat in &all_stats {
918            assert_eq!(
919                merged.get(*stat).is_some(),
920                stat.is_commutative(),
921                "Stat {stat} remains after merge_unordered despite not being commutative, or was removed despite being commutative"
922            )
923        }
924
925        let merged_ref = merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
926        let stats_ref = stats.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
927
928        assert_eq!(
929            merged_ref.get_as::<i32>(Stat::Min),
930            stats_ref.get_as::<i32>(Stat::Min)
931        );
932        assert_eq!(
933            merged_ref.get_as::<i32>(Stat::Max),
934            stats_ref.get_as::<i32>(Stat::Max)
935        );
936        assert_eq!(
937            merged_ref.get_as::<u64>(Stat::NullCount).unwrap(),
938            stats_ref
939                .get_as::<u64>(Stat::NullCount)
940                .unwrap()
941                .map(|s| s * 2)
942        );
943    }
944
945    #[test]
946    fn merge_min_bound_same() {
947        // Merging a stat with a bound and another with an exact results in exact stat.
948        // since bound for min is a lower bound, it can in fact contain any value >= bound.
949        let merged = StatsSet::of(Stat::Min, Precision::inexact(5)).merge_ordered(
950            &StatsSet::of(Stat::Min, Precision::exact(5)),
951            &DType::Primitive(PType::I32, Nullability::NonNullable),
952        );
953        let merged_ref =
954            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
955        assert_eq!(
956            merged_ref.get_as::<i32>(Stat::Min),
957            Some(Precision::exact(5))
958        );
959    }
960
961    #[test]
962    fn merge_min_bound_bound_lower() {
963        let merged = StatsSet::of(Stat::Min, Precision::inexact(4)).merge_ordered(
964            &StatsSet::of(Stat::Min, Precision::exact(5)),
965            &DType::Primitive(PType::I32, Nullability::NonNullable),
966        );
967        let merged_ref =
968            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
969        assert_eq!(
970            merged_ref.get_as::<i32>(Stat::Min),
971            Some(Precision::inexact(4))
972        );
973    }
974
975    #[test]
976    fn test_combine_is_constant() {
977        {
978            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
979            let stats2 = StatsSet::of(Stat::IsConstant, Precision::exact(true));
980            let mut stats_ref =
981                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
982            stats_ref
983                .combine_bool_stat::<IsConstant>(
984                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
985                )
986                .unwrap();
987            assert_eq!(
988                stats_ref.get_as::<bool>(Stat::IsConstant),
989                Some(Precision::exact(true))
990            );
991        }
992
993        {
994            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
995            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
996            let mut stats_ref =
997                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
998            stats_ref
999                .combine_bool_stat::<IsConstant>(
1000                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
1001                )
1002                .unwrap();
1003            assert_eq!(
1004                stats_ref.get_as::<bool>(Stat::IsConstant),
1005                Some(Precision::exact(true))
1006            );
1007        }
1008
1009        {
1010            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(false));
1011            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
1012            let mut stats_ref =
1013                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1014            stats_ref
1015                .combine_bool_stat::<IsConstant>(
1016                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
1017                )
1018                .unwrap();
1019            assert_eq!(
1020                stats_ref.get_as::<bool>(Stat::IsConstant),
1021                Some(Precision::exact(false))
1022            );
1023        }
1024    }
1025
1026    #[test]
1027    fn test_combine_sets_boolean_conflict() {
1028        let mut stats1 = StatsSet::from_iter([
1029            (Stat::IsConstant, Precision::exact(true)),
1030            (Stat::IsSorted, Precision::exact(true)),
1031        ]);
1032
1033        let stats2 = StatsSet::from_iter([
1034            (Stat::IsConstant, Precision::exact(false)),
1035            (Stat::IsSorted, Precision::exact(true)),
1036        ]);
1037
1038        let result = stats1.combine_sets(
1039            &stats2,
1040            &DType::Primitive(PType::I32, Nullability::NonNullable),
1041        );
1042        assert!(result.is_err());
1043    }
1044
1045    #[test]
1046    fn test_combine_sets_with_missing_stats() {
1047        let mut stats1 = StatsSet::from_iter([
1048            (Stat::Min, Precision::exact(42)),
1049            (Stat::UncompressedSizeInBytes, Precision::exact(1000)),
1050        ]);
1051
1052        let stats2 = StatsSet::from_iter([
1053            (Stat::Max, Precision::exact(100)),
1054            (Stat::IsStrictSorted, Precision::exact(true)),
1055        ]);
1056
1057        stats1
1058            .combine_sets(
1059                &stats2,
1060                &DType::Primitive(PType::I32, Nullability::NonNullable),
1061            )
1062            .unwrap();
1063
1064        let stats_ref =
1065            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1066
1067        // Min should remain unchanged
1068        assert_eq!(
1069            stats_ref.get_as::<i32>(Stat::Min),
1070            Some(Precision::exact(42))
1071        );
1072        // Max should be added
1073        assert_eq!(
1074            stats_ref.get_as::<i32>(Stat::Max),
1075            Some(Precision::exact(100))
1076        );
1077        // IsStrictSorted should be added
1078        assert_eq!(
1079            stats_ref.get_as::<bool>(Stat::IsStrictSorted),
1080            Some(Precision::exact(true))
1081        );
1082    }
1083
1084    #[test]
1085    fn test_combine_sets_with_inexact() {
1086        let mut stats1 = StatsSet::from_iter([
1087            (Stat::Min, Precision::exact(42)),
1088            (Stat::Max, Precision::inexact(100)),
1089            (Stat::IsConstant, Precision::exact(false)),
1090        ]);
1091
1092        let stats2 = StatsSet::from_iter([
1093            // Must ensure Min from stats2 is <= Min from stats1
1094            (Stat::Min, Precision::inexact(40)),
1095            (Stat::Max, Precision::exact(90)),
1096            (Stat::IsSorted, Precision::exact(true)),
1097        ]);
1098
1099        stats1
1100            .combine_sets(
1101                &stats2,
1102                &DType::Primitive(PType::I32, Nullability::NonNullable),
1103            )
1104            .unwrap();
1105
1106        let stats_ref =
1107            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1108
1109        // Min should remain unchanged since it's more restrictive than the inexact value
1110        assert_eq!(
1111            stats_ref.get_as::<i32>(Stat::Min),
1112            Some(Precision::exact(42))
1113        );
1114        // Check that max was updated with the exact value
1115        assert_eq!(
1116            stats_ref.get_as::<i32>(Stat::Max),
1117            Some(Precision::exact(90))
1118        );
1119        // Check that IsSorted was added
1120        assert_eq!(
1121            stats_ref.get_as::<bool>(Stat::IsSorted),
1122            Some(Precision::exact(true))
1123        );
1124    }
1125}