Skip to main content

vortex_array/stats/
stats_set.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5
6use enum_iterator::Sequence;
7use enum_iterator::all;
8use num_traits::CheckedAdd;
9use vortex_dtype::DType;
10use vortex_error::VortexError;
11use vortex_error::VortexExpect;
12use vortex_error::VortexResult;
13use vortex_error::vortex_err;
14use vortex_error::vortex_panic;
15
16use crate::expr::stats::IsConstant;
17use crate::expr::stats::IsSorted;
18use crate::expr::stats::IsStrictSorted;
19use crate::expr::stats::Max;
20use crate::expr::stats::Min;
21use crate::expr::stats::NaNCount;
22use crate::expr::stats::NullCount;
23use crate::expr::stats::Precision;
24use crate::expr::stats::Stat;
25use crate::expr::stats::StatBound;
26use crate::expr::stats::StatType;
27use crate::expr::stats::StatsProvider;
28use crate::expr::stats::StatsProviderExt;
29use crate::expr::stats::Sum;
30use crate::expr::stats::UncompressedSizeInBytes;
31use crate::scalar::Scalar;
32use crate::scalar::ScalarValue;
33
34#[derive(Default, Debug, Clone)]
35pub struct StatsSet {
36    values: Vec<(Stat, Precision<ScalarValue>)>,
37}
38
39impl StatsSet {
40    /// Create new StatSet without validating uniqueness of all the entries
41    ///
42    /// # Safety
43    ///
44    /// This method will not panic or trigger UB, but may lead to duplicate stats being stored.
45    pub unsafe fn new_unchecked(values: Vec<(Stat, Precision<ScalarValue>)>) -> Self {
46        Self { values }
47    }
48
49    /// Create StatsSet from single stat and value
50    pub fn of(stat: Stat, value: Precision<ScalarValue>) -> Self {
51        // SAFETY: No duplicate stats will be set here.
52        unsafe { Self::new_unchecked(vec![(stat, value)]) }
53    }
54
55    fn reserve_full_capacity(&mut self) {
56        if self.values.capacity() < Stat::CARDINALITY {
57            self.values
58                .reserve_exact(Stat::CARDINALITY - self.values.capacity());
59        }
60    }
61
62    /// Wrap stats set with a dtype for mutable typed scalar access
63    pub fn as_mut_typed_ref<'a, 'b>(&'a mut self, dtype: &'b DType) -> MutTypedStatsSetRef<'a, 'b> {
64        MutTypedStatsSetRef {
65            values: self,
66            dtype,
67        }
68    }
69
70    /// Wrap stats set with a dtype for typed scalar access
71    pub fn as_typed_ref<'a, 'b>(&'a self, dtype: &'b DType) -> TypedStatsSetRef<'a, 'b> {
72        TypedStatsSetRef {
73            values: self,
74            dtype,
75        }
76    }
77}
78
79// Getters and setters for individual stats.
80impl StatsSet {
81    /// Set the stat `stat` to `value`.
82    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
83        self.reserve_full_capacity();
84
85        if let Some(existing) = self.values.iter_mut().find(|(s, _)| *s == stat) {
86            *existing = (stat, value);
87        } else {
88            self.values.push((stat, value));
89        }
90    }
91
92    /// Clear the stat `stat` from the set.
93    pub fn clear(&mut self, stat: Stat) {
94        self.values.retain(|(s, _)| *s != stat);
95    }
96
97    /// Only keep given stats
98    pub fn retain_only(&mut self, stats: &[Stat]) {
99        self.values.retain(|(s, _)| stats.contains(s));
100    }
101
102    /// Iterate over the statistic names and values in-place.
103    ///
104    /// See [Iterator].
105    pub fn iter(&self) -> impl Iterator<Item = &(Stat, Precision<ScalarValue>)> {
106        self.values.iter()
107    }
108
109    /// Get value for a given stat
110    pub fn get(&self, stat: Stat) -> Option<Precision<ScalarValue>> {
111        self.values
112            .iter()
113            .find(|(s, _)| *s == stat)
114            .map(|(_, v)| v.clone())
115    }
116
117    /// Length of the stats set
118    pub fn len(&self) -> usize {
119        self.values.len()
120    }
121
122    /// Check whether the statset is empty
123    pub fn is_empty(&self) -> bool {
124        self.values.is_empty()
125    }
126
127    /// Get scalar value of a given dtype
128    pub fn get_as<T: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
129        &self,
130        stat: Stat,
131        dtype: &DType,
132    ) -> Option<Precision<T>> {
133        self.get(stat).map(|v| {
134            v.map(|v| {
135                T::try_from(
136                    &Scalar::try_new(dtype.clone(), Some(v))
137                        .vortex_expect("failed to construct a scalar statistic"),
138                )
139                .unwrap_or_else(|err| {
140                    vortex_panic!(
141                        err,
142                        "Failed to get stat {} as {}",
143                        stat,
144                        std::any::type_name::<T>()
145                    )
146                })
147            })
148        })
149    }
150}
151
152// StatSetIntoIter just exists to protect current implementation from exposure on the public API.
153
154/// Owned iterator over the stats.
155///
156/// See [IntoIterator].
157pub struct StatsSetIntoIter(std::vec::IntoIter<(Stat, Precision<ScalarValue>)>);
158
159impl Iterator for StatsSetIntoIter {
160    type Item = (Stat, Precision<ScalarValue>);
161
162    fn next(&mut self) -> Option<Self::Item> {
163        self.0.next()
164    }
165}
166
167impl IntoIterator for StatsSet {
168    type Item = (Stat, Precision<ScalarValue>);
169    type IntoIter = StatsSetIntoIter;
170
171    fn into_iter(self) -> Self::IntoIter {
172        StatsSetIntoIter(self.values.into_iter())
173    }
174}
175
176impl FromIterator<(Stat, Precision<ScalarValue>)> for StatsSet {
177    fn from_iter<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(iter: T) -> Self {
178        let iter = iter.into_iter();
179        let mut values = Vec::default();
180        values.reserve_exact(Stat::CARDINALITY);
181
182        let mut this = Self { values };
183        this.extend(iter);
184        this
185    }
186}
187
188impl Extend<(Stat, Precision<ScalarValue>)> for StatsSet {
189    #[inline]
190    fn extend<T: IntoIterator<Item = (Stat, Precision<ScalarValue>)>>(&mut self, iter: T) {
191        let iter = iter.into_iter();
192        self.reserve_full_capacity();
193
194        iter.for_each(|(stat, value)| self.set(stat, value));
195    }
196}
197
198/// Merge helpers
199impl StatsSet {
200    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
201    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
202    pub fn merge_ordered(mut self, other: &Self, dtype: &DType) -> Self {
203        self.as_mut_typed_ref(dtype)
204            .merge_ordered(&other.as_typed_ref(dtype));
205        self
206    }
207
208    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
209    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
210    pub fn merge_unordered(mut self, other: &Self, dtype: &DType) -> Self {
211        self.as_mut_typed_ref(dtype)
212            .merge_unordered(&other.as_typed_ref(dtype));
213        self
214    }
215
216    /// Given two sets of stats (of differing precision) for the same array, combine them
217    pub fn combine_sets(&mut self, other: &Self, dtype: &DType) -> VortexResult<()> {
218        self.as_mut_typed_ref(dtype)
219            .combine_sets(&other.as_typed_ref(dtype))
220    }
221}
222
223pub struct TypedStatsSetRef<'a, 'b> {
224    pub values: &'a StatsSet,
225    pub dtype: &'b DType,
226}
227
228impl StatsProvider for TypedStatsSetRef<'_, '_> {
229    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
230        self.values.get(stat).map(|p| {
231            p.map(|sv| {
232                Scalar::try_new(
233                    stat.dtype(self.dtype)
234                        .vortex_expect("Must have valid dtype if value is present"),
235                    Some(sv),
236                )
237                .vortex_expect("failed to construct a scalar statistic")
238            })
239        })
240    }
241
242    fn len(&self) -> usize {
243        self.values.len()
244    }
245}
246
247pub struct MutTypedStatsSetRef<'a, 'b> {
248    pub values: &'a mut StatsSet,
249    pub dtype: &'b DType,
250}
251
252impl MutTypedStatsSetRef<'_, '_> {
253    /// Set the stat `stat` to `value`.
254    pub fn set(&mut self, stat: Stat, value: Precision<ScalarValue>) {
255        self.values.set(stat, value);
256    }
257
258    /// Clear the stat `stat` from the set.
259    pub fn clear(&mut self, stat: Stat) {
260        self.values.clear(stat);
261    }
262}
263
264impl StatsProvider for MutTypedStatsSetRef<'_, '_> {
265    fn get(&self, stat: Stat) -> Option<Precision<Scalar>> {
266        self.values.get(stat).map(|p| {
267            p.map(|sv| {
268                Scalar::try_new(
269                    stat.dtype(self.dtype)
270                        .vortex_expect("Must have valid dtype if value is present"),
271                    Some(sv),
272                )
273                .vortex_expect("failed to construct a scalar statistic")
274            })
275        })
276    }
277
278    fn len(&self) -> usize {
279        self.values.len()
280    }
281}
282
283// Merge helpers
284impl MutTypedStatsSetRef<'_, '_> {
285    /// Merge stats set `other` into `self`, with the semantic assumption that `other`
286    /// contains stats from a disjoint array that is *appended* to the array represented by `self`.
287    pub fn merge_ordered(mut self, other: &TypedStatsSetRef) -> Self {
288        for s in all::<Stat>() {
289            match s {
290                Stat::IsConstant => self.merge_is_constant(other),
291                Stat::IsSorted => self.merge_is_sorted(other),
292                Stat::IsStrictSorted => self.merge_is_strict_sorted(other),
293                Stat::Max => self.merge_max(other),
294                Stat::Min => self.merge_min(other),
295                Stat::Sum => self.merge_sum(other),
296                Stat::NullCount => self.merge_null_count(other),
297                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
298                Stat::NaNCount => self.merge_nan_count(other),
299            }
300        }
301
302        self
303    }
304
305    /// Merge stats set `other` into `self`, from a disjoint array, with no ordering assumptions.
306    /// Stats that are not commutative (e.g., is_sorted) are dropped from the result.
307    pub fn merge_unordered(mut self, other: &TypedStatsSetRef) -> Self {
308        for s in all::<Stat>() {
309            if !s.is_commutative() {
310                self.clear(s);
311                continue;
312            }
313
314            match s {
315                Stat::IsConstant => self.merge_is_constant(other),
316                Stat::Max => self.merge_max(other),
317                Stat::Min => self.merge_min(other),
318                Stat::Sum => self.merge_sum(other),
319                Stat::NullCount => self.merge_null_count(other),
320                Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
321                Stat::IsSorted | Stat::IsStrictSorted => {
322                    unreachable!("not commutative")
323                }
324                Stat::NaNCount => self.merge_nan_count(other),
325            }
326        }
327
328        self
329    }
330
331    /// Given two sets of stats (of differing precision) for the same array, combine them
332    pub fn combine_sets(&mut self, other: &TypedStatsSetRef) -> VortexResult<()> {
333        let other_stats: Vec<_> = other.values.iter().map(|(stat, _)| *stat).collect();
334        for s in other_stats {
335            match s {
336                Stat::Max => self.combine_bound::<Max>(other)?,
337                Stat::Min => self.combine_bound::<Min>(other)?,
338                Stat::UncompressedSizeInBytes => {
339                    self.combine_bound::<UncompressedSizeInBytes>(other)?
340                }
341                Stat::IsConstant => self.combine_bool_stat::<IsConstant>(other)?,
342                Stat::IsSorted => self.combine_bool_stat::<IsSorted>(other)?,
343                Stat::IsStrictSorted => self.combine_bool_stat::<IsStrictSorted>(other)?,
344                Stat::NullCount => self.combine_bound::<NullCount>(other)?,
345                Stat::Sum => self.combine_bound::<Sum>(other)?,
346                Stat::NaNCount => self.combine_bound::<NaNCount>(other)?,
347            }
348        }
349        Ok(())
350    }
351
352    fn combine_bound<S: StatType<Scalar>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
353    where
354        S::Bound: StatBound<Scalar> + Debug + Eq + PartialEq,
355    {
356        match (self.get_scalar_bound::<S>(), other.get_scalar_bound::<S>()) {
357            (Some(m1), Some(m2)) => {
358                let meet = m1
359                    .intersection(&m2)
360                    .vortex_expect("can always compare scalar")
361                    .ok_or_else(|| {
362                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
363                    })?;
364                if meet != m1 {
365                    self.set(
366                        S::STAT,
367                        meet.into_value().map(|s| {
368                            s.into_value()
369                                .vortex_expect("stat scalar value cannot be null")
370                        }),
371                    );
372                }
373            }
374            (None, Some(m)) => self.set(
375                S::STAT,
376                m.into_value().map(|s| {
377                    s.into_value()
378                        .vortex_expect("stat scalar value cannot be null")
379                }),
380            ),
381            (Some(_), _) => (),
382            (None, None) => self.clear(S::STAT),
383        }
384        Ok(())
385    }
386
387    fn combine_bool_stat<S: StatType<bool>>(&mut self, other: &TypedStatsSetRef) -> VortexResult<()>
388    where
389        S::Bound: StatBound<bool> + Debug + Eq + PartialEq,
390    {
391        match (
392            self.get_as_bound::<S, bool>(),
393            other.get_as_bound::<S, bool>(),
394        ) {
395            (Some(m1), Some(m2)) => {
396                let intersection = m1
397                    .intersection(&m2)
398                    .vortex_expect("can always compare boolean")
399                    .ok_or_else(|| {
400                        vortex_err!("{:?} bounds ({m1:?}, {m2:?}) do not overlap", S::STAT)
401                    })?;
402                if intersection != m1 {
403                    self.set(S::STAT, intersection.into_value().map(ScalarValue::from));
404                }
405            }
406            (None, Some(m)) => self.set(S::STAT, m.into_value().map(ScalarValue::from)),
407            (Some(_), None) => (),
408            (None, None) => self.clear(S::STAT),
409        }
410        Ok(())
411    }
412
413    fn merge_min(&mut self, other: &TypedStatsSetRef) {
414        match (
415            self.get_scalar_bound::<Min>(),
416            other.get_scalar_bound::<Min>(),
417        ) {
418            (Some(m1), Some(m2)) => {
419                let meet = m1.union(&m2).vortex_expect("can compare scalar");
420                if meet != m1 {
421                    self.set(
422                        Stat::Min,
423                        meet.into_value().map(|s| {
424                            s.into_value()
425                                .vortex_expect("stat scalar value cannot be null")
426                        }),
427                    );
428                }
429            }
430            _ => self.clear(Stat::Min),
431        }
432    }
433
434    fn merge_max(&mut self, other: &TypedStatsSetRef) {
435        match (
436            self.get_scalar_bound::<Max>(),
437            other.get_scalar_bound::<Max>(),
438        ) {
439            (Some(m1), Some(m2)) => {
440                let meet = m1.union(&m2).vortex_expect("can compare scalar");
441                if meet != m1 {
442                    self.set(
443                        Stat::Max,
444                        meet.into_value().map(|s| {
445                            s.into_value()
446                                .vortex_expect("stat scalar value cannot be null")
447                        }),
448                    );
449                }
450            }
451            _ => self.clear(Stat::Max),
452        }
453    }
454
455    fn merge_sum(&mut self, other: &TypedStatsSetRef) {
456        match (
457            self.get_scalar_bound::<Sum>(),
458            other.get_scalar_bound::<Sum>(),
459        ) {
460            (Some(m1), Some(m2)) => {
461                // If the combine sum is exact, then we can sum them.
462                if let Some(scalar_value) = m1.zip(m2).as_exact().and_then(|(s1, s2)| {
463                    s1.as_primitive()
464                        .checked_add(&s2.as_primitive())
465                        .and_then(|pscalar| pscalar.pvalue().map(ScalarValue::Primitive))
466                }) {
467                    self.set(Stat::Sum, Precision::Exact(scalar_value));
468                }
469            }
470            _ => self.clear(Stat::Sum),
471        }
472    }
473
474    fn merge_is_constant(&mut self, other: &TypedStatsSetRef) {
475        let self_const = self.get_as(Stat::IsConstant);
476        let other_const = other.get_as(Stat::IsConstant);
477        let self_min = self.get(Stat::Min);
478        let other_min = other.get(Stat::Min);
479
480        if let (
481            Some(Precision::Exact(self_const)),
482            Some(Precision::Exact(other_const)),
483            Some(Precision::Exact(self_min)),
484            Some(Precision::Exact(other_min)),
485        ) = (self_const, other_const, self_min, other_min)
486        {
487            if self_const && other_const && self_min == other_min {
488                self.set(Stat::IsConstant, Precision::exact(true));
489            } else {
490                self.set(Stat::IsConstant, Precision::inexact(false));
491            }
492        }
493        self.set(Stat::IsConstant, Precision::exact(false));
494    }
495
496    fn merge_is_sorted(&mut self, other: &TypedStatsSetRef) {
497        self.merge_sortedness_stat(other, Stat::IsSorted, PartialOrd::le)
498    }
499
500    fn merge_is_strict_sorted(&mut self, other: &TypedStatsSetRef) {
501        self.merge_sortedness_stat(other, Stat::IsStrictSorted, PartialOrd::lt)
502    }
503
504    fn merge_sortedness_stat<F: Fn(&Scalar, &Scalar) -> bool>(
505        &mut self,
506        other: &TypedStatsSetRef,
507        stat: Stat,
508        cmp: F,
509    ) {
510        if (Some(Precision::Exact(true)), Some(Precision::Exact(true)))
511            == (self.get_as(stat), other.get_as(stat))
512        {
513            // There might be no stat because it was dropped, or it doesn't exist
514            // (e.g. an all null array).
515            // We assume that it was the dropped case since the doesn't exist might imply sorted,
516            // but this in-precision is correct.
517            if let (Some(self_max), Some(other_min)) = (
518                self.get_scalar_bound::<Max>(),
519                other.get_scalar_bound::<Min>(),
520            ) {
521                return if cmp(&self_max.max_value(), &other_min.min_value()) {
522                    // keep value
523                } else {
524                    self.set(stat, Precision::inexact(false));
525                };
526            }
527        }
528        self.clear(stat);
529    }
530
531    fn merge_null_count(&mut self, other: &TypedStatsSetRef) {
532        self.merge_sum_stat(Stat::NullCount, other)
533    }
534
535    fn merge_nan_count(&mut self, other: &TypedStatsSetRef) {
536        self.merge_sum_stat(Stat::NaNCount, other)
537    }
538
539    fn merge_uncompressed_size_in_bytes(&mut self, other: &TypedStatsSetRef) {
540        self.merge_sum_stat(Stat::UncompressedSizeInBytes, other)
541    }
542
543    fn merge_sum_stat(&mut self, stat: Stat, other: &TypedStatsSetRef) {
544        match (self.get_as::<usize>(stat), other.get_as::<usize>(stat)) {
545            (Some(nc1), Some(nc2)) => {
546                self.set(
547                    stat,
548                    nc1.zip(nc2).map(|(nc1, nc2)| ScalarValue::from(nc1 + nc2)),
549                );
550            }
551            _ => self.clear(stat),
552        }
553    }
554}
555
556#[cfg(test)]
557mod test {
558    use enum_iterator::all;
559    use itertools::Itertools;
560    use vortex_dtype::DType;
561    use vortex_dtype::Nullability;
562    use vortex_dtype::PType;
563
564    use crate::arrays::PrimitiveArray;
565    use crate::expr::stats::IsConstant;
566    use crate::expr::stats::Precision;
567    use crate::expr::stats::Stat;
568    use crate::expr::stats::StatsProvider;
569    use crate::expr::stats::StatsProviderExt;
570    use crate::stats::StatsSet;
571    use crate::stats::stats_set::Scalar;
572
573    #[test]
574    fn test_iter() {
575        // SAFETY: No duplicate stats.
576        let set = unsafe {
577            StatsSet::new_unchecked(vec![
578                (Stat::Max, Precision::exact(100)),
579                (Stat::Min, Precision::exact(42)),
580            ])
581        };
582        let mut iter = set.iter();
583        let first = iter.next().unwrap().clone();
584        assert_eq!(first.0, Stat::Max);
585        assert_eq!(
586            first.1.map(
587                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
588            ),
589            Precision::exact(100)
590        );
591        let snd = iter.next().unwrap().clone();
592        assert_eq!(snd.0, Stat::Min);
593        assert_eq!(
594            snd.1.map(
595                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
596            ),
597            Precision::exact(42)
598        );
599    }
600
601    #[test]
602    fn into_iter() {
603        // SAFETY: No duplicate stats.
604        let mut set = unsafe {
605            StatsSet::new_unchecked(vec![
606                (Stat::Max, Precision::exact(100)),
607                (Stat::Min, Precision::exact(42)),
608            ])
609        }
610        .into_iter();
611        let (stat, first) = set.next().unwrap();
612        assert_eq!(stat, Stat::Max);
613        assert_eq!(
614            first.map(
615                |f| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(f)).unwrap()).unwrap()
616            ),
617            Precision::exact(100)
618        );
619        let snd = set.next().unwrap();
620        assert_eq!(snd.0, Stat::Min);
621        assert_eq!(
622            snd.1.map(
623                |s| i32::try_from(&Scalar::try_new(PType::I32.into(), Some(s)).unwrap()).unwrap()
624            ),
625            Precision::exact(42)
626        );
627    }
628
629    #[test]
630    fn merge_constant() {
631        let first = StatsSet::from_iter([
632            (Stat::Min, Precision::exact(42)),
633            (Stat::IsConstant, Precision::exact(true)),
634        ])
635        .merge_ordered(
636            &StatsSet::from_iter([
637                (Stat::Min, Precision::inexact(42)),
638                (Stat::IsConstant, Precision::exact(true)),
639            ]),
640            &DType::Primitive(PType::I32, Nullability::NonNullable),
641        );
642
643        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
644        assert_eq!(
645            first_ref.get_as::<bool>(Stat::IsConstant),
646            Some(Precision::exact(false))
647        );
648        assert_eq!(
649            first_ref.get_as::<i32>(Stat::Min),
650            Some(Precision::exact(42))
651        );
652    }
653
654    #[test]
655    fn merge_into_min() {
656        let first = StatsSet::of(Stat::Min, Precision::exact(42)).merge_ordered(
657            &StatsSet::default(),
658            &DType::Primitive(PType::I32, Nullability::NonNullable),
659        );
660
661        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
662        assert!(first_ref.get(Stat::Min).is_none());
663    }
664
665    #[test]
666    fn merge_from_min() {
667        let first = StatsSet::default().merge_ordered(
668            &StatsSet::of(Stat::Min, Precision::exact(42)),
669            &DType::Primitive(PType::I32, Nullability::NonNullable),
670        );
671
672        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
673        assert!(first_ref.get(Stat::Min).is_none());
674    }
675
676    #[test]
677    fn merge_mins() {
678        let first = StatsSet::of(Stat::Min, Precision::exact(37)).merge_ordered(
679            &StatsSet::of(Stat::Min, Precision::exact(42)),
680            &DType::Primitive(PType::I32, Nullability::NonNullable),
681        );
682
683        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
684        assert_eq!(
685            first_ref.get_as::<i32>(Stat::Min),
686            Some(Precision::exact(37))
687        );
688    }
689
690    #[test]
691    fn merge_into_bound_max() {
692        let first = StatsSet::of(Stat::Max, Precision::exact(42)).merge_ordered(
693            &StatsSet::default(),
694            &DType::Primitive(PType::I32, Nullability::NonNullable),
695        );
696        assert!(first.get(Stat::Max).is_none());
697    }
698
699    #[test]
700    fn merge_from_max() {
701        let first = StatsSet::default().merge_ordered(
702            &StatsSet::of(Stat::Max, Precision::exact(42)),
703            &DType::Primitive(PType::I32, Nullability::NonNullable),
704        );
705        assert!(first.get(Stat::Max).is_none());
706    }
707
708    #[test]
709    fn merge_maxes() {
710        let first = StatsSet::of(Stat::Max, Precision::exact(37)).merge_ordered(
711            &StatsSet::of(Stat::Max, Precision::exact(42)),
712            &DType::Primitive(PType::I32, Nullability::NonNullable),
713        );
714        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
715        assert_eq!(
716            first_ref.get_as::<i32>(Stat::Max),
717            Some(Precision::exact(42))
718        );
719    }
720
721    #[test]
722    fn merge_maxes_bound() {
723        let dtype = DType::Primitive(PType::I32, Nullability::NonNullable);
724        let first = StatsSet::of(Stat::Max, Precision::exact(42i32))
725            .merge_ordered(&StatsSet::of(Stat::Max, Precision::inexact(43i32)), &dtype);
726        let first_ref = first.as_typed_ref(&dtype);
727        assert_eq!(
728            first_ref.get_as::<i32>(Stat::Max),
729            Some(Precision::inexact(43))
730        );
731    }
732
733    #[test]
734    fn merge_into_scalar() {
735        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
736        // for unsigned, f64 for floats).
737        let first = StatsSet::of(Stat::Sum, Precision::exact(42i64)).merge_ordered(
738            &StatsSet::default(),
739            &DType::Primitive(PType::I32, Nullability::NonNullable),
740        );
741        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
742        assert!(first_ref.get(Stat::Sum).is_none());
743    }
744
745    #[test]
746    fn merge_from_scalar() {
747        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
748        // for unsigned, f64 for floats).
749        let first = StatsSet::default().merge_ordered(
750            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
751            &DType::Primitive(PType::I32, Nullability::NonNullable),
752        );
753        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
754        assert!(first_ref.get(Stat::Sum).is_none());
755    }
756
757    #[test]
758    fn merge_scalars() {
759        // Sum stats for primitive types are always the 64-bit version (i64 for signed, u64
760        // for unsigned, f64 for floats).
761        let first = StatsSet::of(Stat::Sum, Precision::exact(37i64)).merge_ordered(
762            &StatsSet::of(Stat::Sum, Precision::exact(42i64)),
763            &DType::Primitive(PType::I32, Nullability::NonNullable),
764        );
765        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
766        assert_eq!(
767            first_ref.get_as::<i64>(Stat::Sum),
768            Some(Precision::exact(79i64))
769        );
770    }
771
772    #[test]
773    fn merge_into_sortedness() {
774        let first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)).merge_ordered(
775            &StatsSet::default(),
776            &DType::Primitive(PType::I32, Nullability::NonNullable),
777        );
778        assert!(first.get(Stat::IsStrictSorted).is_none());
779    }
780
781    #[test]
782    fn merge_from_sortedness() {
783        let first = StatsSet::default().merge_ordered(
784            &StatsSet::of(Stat::IsStrictSorted, Precision::exact(true)),
785            &DType::Primitive(PType::I32, Nullability::NonNullable),
786        );
787        assert!(first.get(Stat::IsStrictSorted).is_none());
788    }
789
790    #[test]
791    fn merge_sortedness() {
792        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
793        first.set(Stat::Max, Precision::exact(1));
794        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
795        second.set(Stat::Min, Precision::exact(2));
796        first = first.merge_ordered(
797            &second,
798            &DType::Primitive(PType::I32, Nullability::NonNullable),
799        );
800
801        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
802        assert_eq!(
803            first_ref.get_as::<bool>(Stat::IsStrictSorted),
804            Some(Precision::exact(true))
805        );
806    }
807
808    #[test]
809    fn merge_sortedness_out_of_order() {
810        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
811        first.set(Stat::Min, Precision::exact(1));
812        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
813        second.set(Stat::Max, Precision::exact(2));
814        second = second.merge_ordered(
815            &first,
816            &DType::Primitive(PType::I32, Nullability::NonNullable),
817        );
818
819        let second_ref =
820            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
821        assert_eq!(
822            second_ref.get_as::<bool>(Stat::IsStrictSorted),
823            Some(Precision::inexact(false))
824        );
825    }
826
827    #[test]
828    fn merge_sortedness_only_one_sorted() {
829        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
830        first.set(Stat::Max, Precision::exact(1));
831        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(false));
832        second.set(Stat::Min, Precision::exact(2));
833        first.merge_ordered(
834            &second,
835            &DType::Primitive(PType::I32, Nullability::NonNullable),
836        );
837
838        let second_ref =
839            second.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
840        assert_eq!(
841            second_ref.get_as::<bool>(Stat::IsStrictSorted),
842            Some(Precision::exact(false))
843        );
844    }
845
846    #[test]
847    fn merge_sortedness_missing_min() {
848        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
849        first.set(Stat::Max, Precision::exact(1));
850        let second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
851        first = first.merge_ordered(
852            &second,
853            &DType::Primitive(PType::I32, Nullability::NonNullable),
854        );
855        assert!(first.get(Stat::IsStrictSorted).is_none());
856    }
857
858    #[test]
859    fn merge_sortedness_bound_min() {
860        let mut first = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
861        first.set(Stat::Max, Precision::exact(1));
862        let mut second = StatsSet::of(Stat::IsStrictSorted, Precision::exact(true));
863        second.set(Stat::Min, Precision::inexact(2));
864        first = first.merge_ordered(
865            &second,
866            &DType::Primitive(PType::I32, Nullability::NonNullable),
867        );
868
869        let first_ref = first.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
870        assert_eq!(
871            first_ref.get_as::<bool>(Stat::IsStrictSorted),
872            Some(Precision::exact(true))
873        );
874    }
875
876    #[test]
877    fn merge_unordered() {
878        let array =
879            PrimitiveArray::from_option_iter([Some(1), None, Some(2), Some(42), Some(10000), None]);
880        let all_stats = all::<Stat>()
881            .filter(|s| !matches!(s, Stat::Sum))
882            .filter(|s| !matches!(s, Stat::NaNCount))
883            .collect_vec();
884        array.statistics().compute_all(&all_stats).unwrap();
885
886        let stats = array.statistics().to_owned();
887        for stat in &all_stats {
888            assert!(stats.get(*stat).is_some(), "Stat {stat} is missing");
889        }
890
891        let merged = stats.clone().merge_unordered(
892            &stats,
893            &DType::Primitive(PType::I32, Nullability::NonNullable),
894        );
895        for stat in &all_stats {
896            assert_eq!(
897                merged.get(*stat).is_some(),
898                stat.is_commutative(),
899                "Stat {stat} remains after merge_unordered despite not being commutative, or was removed despite being commutative"
900            )
901        }
902
903        let merged_ref = merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
904        let stats_ref = stats.as_typed_ref(&DType::Primitive(PType::I32, Nullability::Nullable));
905
906        assert_eq!(
907            merged_ref.get_as::<i32>(Stat::Min),
908            stats_ref.get_as::<i32>(Stat::Min)
909        );
910        assert_eq!(
911            merged_ref.get_as::<i32>(Stat::Max),
912            stats_ref.get_as::<i32>(Stat::Max)
913        );
914        assert_eq!(
915            merged_ref.get_as::<u64>(Stat::NullCount).unwrap(),
916            stats_ref
917                .get_as::<u64>(Stat::NullCount)
918                .unwrap()
919                .map(|s| s * 2)
920        );
921    }
922
923    #[test]
924    fn merge_min_bound_same() {
925        // Merging a stat with a bound and another with an exact results in exact stat.
926        // since bound for min is a lower bound, it can in fact contain any value >= bound.
927        let merged = StatsSet::of(Stat::Min, Precision::inexact(5)).merge_ordered(
928            &StatsSet::of(Stat::Min, Precision::exact(5)),
929            &DType::Primitive(PType::I32, Nullability::NonNullable),
930        );
931        let merged_ref =
932            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
933        assert_eq!(
934            merged_ref.get_as::<i32>(Stat::Min),
935            Some(Precision::exact(5))
936        );
937    }
938
939    #[test]
940    fn merge_min_bound_bound_lower() {
941        let merged = StatsSet::of(Stat::Min, Precision::inexact(4)).merge_ordered(
942            &StatsSet::of(Stat::Min, Precision::exact(5)),
943            &DType::Primitive(PType::I32, Nullability::NonNullable),
944        );
945        let merged_ref =
946            merged.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
947        assert_eq!(
948            merged_ref.get_as::<i32>(Stat::Min),
949            Some(Precision::inexact(4))
950        );
951    }
952
953    #[test]
954    fn test_combine_is_constant() {
955        {
956            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
957            let stats2 = StatsSet::of(Stat::IsConstant, Precision::exact(true));
958            let mut stats_ref =
959                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
960            stats_ref
961                .combine_bool_stat::<IsConstant>(
962                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
963                )
964                .unwrap();
965            assert_eq!(
966                stats_ref.get_as::<bool>(Stat::IsConstant),
967                Some(Precision::exact(true))
968            );
969        }
970
971        {
972            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(true));
973            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
974            let mut stats_ref =
975                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
976            stats_ref
977                .combine_bool_stat::<IsConstant>(
978                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
979                )
980                .unwrap();
981            assert_eq!(
982                stats_ref.get_as::<bool>(Stat::IsConstant),
983                Some(Precision::exact(true))
984            );
985        }
986
987        {
988            let mut stats = StatsSet::of(Stat::IsConstant, Precision::exact(false));
989            let stats2 = StatsSet::of(Stat::IsConstant, Precision::inexact(false));
990            let mut stats_ref =
991                stats.as_mut_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
992            stats_ref
993                .combine_bool_stat::<IsConstant>(
994                    &stats2.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable)),
995                )
996                .unwrap();
997            assert_eq!(
998                stats_ref.get_as::<bool>(Stat::IsConstant),
999                Some(Precision::exact(false))
1000            );
1001        }
1002    }
1003
1004    #[test]
1005    fn test_combine_sets_boolean_conflict() {
1006        let mut stats1 = StatsSet::from_iter([
1007            (Stat::IsConstant, Precision::exact(true)),
1008            (Stat::IsSorted, Precision::exact(true)),
1009        ]);
1010
1011        let stats2 = StatsSet::from_iter([
1012            (Stat::IsConstant, Precision::exact(false)),
1013            (Stat::IsSorted, Precision::exact(true)),
1014        ]);
1015
1016        let result = stats1.combine_sets(
1017            &stats2,
1018            &DType::Primitive(PType::I32, Nullability::NonNullable),
1019        );
1020        assert!(result.is_err());
1021    }
1022
1023    #[test]
1024    fn test_combine_sets_with_missing_stats() {
1025        let mut stats1 = StatsSet::from_iter([
1026            (Stat::Min, Precision::exact(42)),
1027            (Stat::UncompressedSizeInBytes, Precision::exact(1000)),
1028        ]);
1029
1030        let stats2 = StatsSet::from_iter([
1031            (Stat::Max, Precision::exact(100)),
1032            (Stat::IsStrictSorted, Precision::exact(true)),
1033        ]);
1034
1035        stats1
1036            .combine_sets(
1037                &stats2,
1038                &DType::Primitive(PType::I32, Nullability::NonNullable),
1039            )
1040            .unwrap();
1041
1042        let stats_ref =
1043            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1044
1045        // Min should remain unchanged
1046        assert_eq!(
1047            stats_ref.get_as::<i32>(Stat::Min),
1048            Some(Precision::exact(42))
1049        );
1050        // Max should be added
1051        assert_eq!(
1052            stats_ref.get_as::<i32>(Stat::Max),
1053            Some(Precision::exact(100))
1054        );
1055        // IsStrictSorted should be added
1056        assert_eq!(
1057            stats_ref.get_as::<bool>(Stat::IsStrictSorted),
1058            Some(Precision::exact(true))
1059        );
1060    }
1061
1062    #[test]
1063    fn test_combine_sets_with_inexact() {
1064        let mut stats1 = StatsSet::from_iter([
1065            (Stat::Min, Precision::exact(42)),
1066            (Stat::Max, Precision::inexact(100)),
1067            (Stat::IsConstant, Precision::exact(false)),
1068        ]);
1069
1070        let stats2 = StatsSet::from_iter([
1071            // Must ensure Min from stats2 is <= Min from stats1
1072            (Stat::Min, Precision::inexact(40)),
1073            (Stat::Max, Precision::exact(90)),
1074            (Stat::IsSorted, Precision::exact(true)),
1075        ]);
1076
1077        stats1
1078            .combine_sets(
1079                &stats2,
1080                &DType::Primitive(PType::I32, Nullability::NonNullable),
1081            )
1082            .unwrap();
1083
1084        let stats_ref =
1085            stats1.as_typed_ref(&DType::Primitive(PType::I32, Nullability::NonNullable));
1086
1087        // Min should remain unchanged since it's more restrictive than the inexact value
1088        assert_eq!(
1089            stats_ref.get_as::<i32>(Stat::Min),
1090            Some(Precision::exact(42))
1091        );
1092        // Check that max was updated with the exact value
1093        assert_eq!(
1094            stats_ref.get_as::<i32>(Stat::Max),
1095            Some(Precision::exact(90))
1096        );
1097        // Check that IsSorted was added
1098        assert_eq!(
1099            stats_ref.get_as::<bool>(Stat::IsSorted),
1100            Some(Precision::exact(true))
1101        );
1102    }
1103}