vortex_layout/layouts/stats/
stats_table.rs

1use std::sync::Arc;
2
3use itertools::Itertools;
4use vortex_array::arrays::StructArray;
5use vortex_array::builders::{ArrayBuilder, ArrayBuilderExt, builder_with_capacity};
6use vortex_array::compute::sum;
7use vortex_array::stats::{Precision, Stat, StatsSet};
8use vortex_array::validity::Validity;
9use vortex_array::{Array, ArrayRef, ArrayVariants};
10use vortex_dtype::{DType, Nullability, PType, StructDType};
11use vortex_error::{VortexExpect, VortexResult, vortex_bail};
12
13/// A table of statistics for a column.
14/// Each row of the stats table corresponds to a chunk of the column.
15///
16/// Note that it's possible for the stats table to have no statistics.
17#[derive(Clone)]
18pub struct StatsTable {
19    // The struct array backing the stats table
20    array: ArrayRef,
21    // The statistics that are included in the table.
22    stats: Arc<[Stat]>,
23}
24
25impl StatsTable {
26    /// Create StatsTable of given column_dtype from given array. Validates that the array matches expected
27    /// structure for given list of stats
28    pub fn try_new(column_dtype: DType, array: ArrayRef, stats: Arc<[Stat]>) -> VortexResult<Self> {
29        if &Self::dtype_for_stats_table(&column_dtype, &stats) != array.dtype() {
30            vortex_bail!("Array dtype does not match expected stats table dtype");
31        }
32        Ok(Self::unchecked_new(array, stats))
33    }
34
35    /// Create StatsTable without validating return array against expected stats
36    pub fn unchecked_new(array: ArrayRef, stats: Arc<[Stat]>) -> Self {
37        Self { array, stats }
38    }
39
40    /// Returns the DType of the statistics table given a set of statistics and column [`DType`].
41    pub fn dtype_for_stats_table(column_dtype: &DType, present_stats: &[Stat]) -> DType {
42        assert!(present_stats.is_sorted(), "Stats must be sorted");
43        DType::Struct(
44            Arc::new(StructDType::from_iter(present_stats.iter().filter_map(
45                |stat| {
46                    stat.dtype(column_dtype)
47                        .map(|dtype| (stat.name(), dtype.as_nullable()))
48                },
49            ))),
50            Nullability::NonNullable,
51        )
52    }
53
54    /// The struct array backing the stats table
55    pub fn array(&self) -> &ArrayRef {
56        &self.array
57    }
58
59    /// The statistics that are included in the table.
60    pub fn present_stats(&self) -> &[Stat] {
61        &self.stats
62    }
63
64    /// Return an aggregated stats set for the table.
65    pub fn to_stats_set(&self, stats: &[Stat]) -> VortexResult<StatsSet> {
66        let mut stats_set = StatsSet::default();
67        for stat in stats {
68            let Some(array) = self.get_stat(*stat)? else {
69                continue;
70            };
71
72            // Different stats need different aggregations
73            match stat {
74                // For stats that are associative, we can just compute them over the stat column
75                Stat::Min | Stat::Max | Stat::Sum => {
76                    if let Some(s) = array.statistics().compute_stat(*stat)? {
77                        stats_set.set(*stat, Precision::exact(s))
78                    }
79                }
80                // These stats sum up
81                Stat::NullCount | Stat::UncompressedSizeInBytes => {
82                    let sum = sum(&array)?
83                        .cast(&DType::Primitive(PType::U64, Nullability::Nullable))?
84                        .into_value();
85                    stats_set.set(*stat, Precision::exact(sum));
86                }
87                // We could implement these aggregations in the future, but for now they're unused
88                Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted => {}
89            }
90        }
91        Ok(stats_set)
92    }
93
94    /// Return the array for a given stat.
95    pub fn get_stat(&self, stat: Stat) -> VortexResult<Option<ArrayRef>> {
96        Ok(self
97            .array
98            .as_struct_typed()
99            .vortex_expect("Stats table must be a struct array")
100            .maybe_null_field_by_name(stat.name())
101            .ok())
102    }
103}
104
105/// Accumulates statistics for a column.
106///
107/// TODO(ngates): we should make it such that the stats table stores a mirror of the DType
108///  underneath each stats column. For example, `min: i32` for an `i32` array.
109///  Or `min: {a: i32, b: i32}` for a struct array of type `{a: i32, b: i32}`.
110///  See: <https://github.com/spiraldb/vortex/issues/1835>
111pub struct StatsAccumulator {
112    stats: Arc<[Stat]>,
113    builders: Vec<Box<dyn ArrayBuilder>>,
114    length: usize,
115}
116
117impl StatsAccumulator {
118    pub fn new(dtype: DType, stats: &[Stat]) -> Self {
119        let (stats, builders): (Vec<Stat>, _) = stats
120            .iter()
121            .filter_map(|s| {
122                s.dtype(&dtype)
123                    .map(|stat_dtype| (*s, builder_with_capacity(&stat_dtype.as_nullable(), 1024)))
124            })
125            .unzip();
126
127        Self {
128            stats: stats.into(),
129            builders,
130            length: 0,
131        }
132    }
133
134    pub fn stats(&self) -> &[Stat] {
135        &self.stats
136    }
137
138    pub fn push_chunk(&mut self, array: &dyn Array) -> VortexResult<()> {
139        for (s, builder) in self.stats.iter().zip_eq(self.builders.iter_mut()) {
140            if let Some(v) = array.statistics().compute_stat(*s)? {
141                builder.append_scalar_value(v)?;
142            } else {
143                builder.append_null();
144            }
145        }
146        self.length += 1;
147        Ok(())
148    }
149
150    /// Finishes the accumulator into a [`StatsTable`].
151    ///
152    /// Returns `None` if none of the requested statistics can be computed, for example they are
153    /// not applicable to the column's data type.
154    pub fn as_stats_table(&mut self) -> Option<StatsTable> {
155        let mut names = Vec::new();
156        let mut fields = Vec::new();
157        let mut stats = Vec::new();
158
159        for (stat, builder) in self
160            .stats
161            .iter()
162            .zip(self.builders.iter_mut())
163            // We sort the stats so the DType is deterministic based on which stats are present.
164            .sorted_unstable_by_key(|&(&s, _)| s)
165        {
166            let values = builder.finish();
167
168            // We drop any all-null stats columns
169            if values
170                .invalid_count()
171                .vortex_expect("failed to get invalid count")
172                == values.len()
173            {
174                continue;
175            }
176
177            stats.push(*stat);
178            names.push(stat.to_string().into());
179            fields.push(values);
180        }
181
182        if names.is_empty() {
183            return None;
184        }
185
186        Some(StatsTable {
187            array: StructArray::try_new(names.into(), fields, self.length, Validity::NonNullable)
188                .vortex_expect("Failed to create stats table")
189                .into_array(),
190            stats: stats.into(),
191        })
192    }
193}