Skip to main content

vortex_file/footer/
file_statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module defines the file statistics component of the Vortex file footer.
5//!
6//! File statistics provide metadata about the data in the file, such as min/max values,
7//! null counts, and other statistical information that can be used for query optimization
8//! and data exploration.
9use std::sync::Arc;
10
11use flatbuffers::FlatBufferBuilder;
12use flatbuffers::WIPOffset;
13use itertools::Itertools;
14use vortex_array::dtype::DType;
15use vortex_array::stats::StatsSet;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_ensure_eq;
19use vortex_flatbuffers::FlatBufferRoot;
20use vortex_flatbuffers::WriteFlatBuffer;
21use vortex_flatbuffers::array::ArrayStats;
22use vortex_flatbuffers::footer as fb;
23use vortex_session::VortexSession;
24
25/// Contains statistical information about the data in a Vortex file.
26///
27/// This struct wraps an array of `StatsSet` objects, each containing statistics
28/// for a field or column in the file. These statistics can be used for query
29/// optimization and data exploration.
30#[derive(Clone, Debug)]
31pub struct FileStatistics {
32    /// An array of statistics sets, one for each field or column in the file.
33    stats: Arc<[StatsSet]>,
34    /// An array of `DType`s, one for each field or column in the file.
35    dtypes: Arc<[DType]>,
36}
37
38impl FileStatistics {
39    /// Creates a new [`FileStatistics`] from the given statistics and data types.
40    ///
41    /// # Panics
42    ///
43    /// Panics if `stats` and `dtypes` have different lengths.
44    pub fn new(stats: Arc<[StatsSet]>, dtypes: Arc<[DType]>) -> Self {
45        assert_eq!(
46            stats.len(),
47            dtypes.len(),
48            "stats and dtypes must have the same length"
49        );
50
51        Self { stats, dtypes }
52    }
53
54    /// Creates a new [`FileStatistics`] from the given statistics and file dtype.
55    ///
56    /// If the [`DType`] of the file is a [`DType::Struct`], then there must be the same number of
57    /// stats as struct fields. Otherwise, there must be only 1 statistic.
58    ///
59    /// # Panics
60    ///
61    /// Panics if the number of stats doesn't match the expected number based on the dtype.
62    pub fn new_with_dtype(stats: Arc<[StatsSet]>, file_dtype: &DType) -> Self {
63        if let DType::Struct(struct_fields, _) = file_dtype {
64            assert_eq!(
65                stats.len(),
66                struct_fields.nfields(),
67                "stats length must match number of struct fields"
68            );
69
70            let dtypes = struct_fields.fields().collect();
71
72            Self { stats, dtypes }
73        } else {
74            assert_eq!(
75                stats.len(),
76                1,
77                "non-struct dtype must have exactly 1 statistic"
78            );
79
80            Self {
81                stats,
82                dtypes: Arc::new([file_dtype.clone()]),
83            }
84        }
85    }
86
87    /// Creates [`FileStatistics`] from a flatbuffers [`fb::FileStatistics<'a>`].
88    ///
89    /// If the [`DType`] of the file is a [`DType::Struct`], then there must be the same number of
90    /// file stats in the flatbuffer. Otherwise, there must be only 1 statistic.
91    pub fn from_flatbuffer<'a>(
92        fb: &fb::FileStatistics<'a>,
93        file_dtype: &DType,
94        session: &VortexSession,
95    ) -> VortexResult<Self> {
96        let field_stats = fb.field_stats().unwrap_or_default();
97        let mut array_stats: Vec<ArrayStats> = field_stats.iter().collect();
98
99        if let DType::Struct(struct_fields, _) = file_dtype {
100            vortex_ensure_eq!(array_stats.len(), struct_fields.nfields());
101
102            let stats_sets: Arc<[StatsSet]> = array_stats
103                .into_iter()
104                .zip(struct_fields.fields())
105                .map(|(array_stat, field_dtype)| {
106                    StatsSet::from_flatbuffer(&array_stat, &field_dtype, session)
107                })
108                .try_collect()?;
109
110            let dtypes = struct_fields.fields().collect();
111
112            Ok(Self {
113                stats: stats_sets,
114                dtypes,
115            })
116        } else {
117            vortex_ensure_eq!(array_stats.len(), 1);
118
119            let array_stat = array_stats
120                .pop()
121                .vortex_expect("we just checked that there was 1 field");
122            let stats_set = StatsSet::from_flatbuffer(&array_stat, file_dtype, session)?;
123
124            Ok(Self {
125                stats: Arc::new([stats_set]),
126                dtypes: Arc::new([file_dtype.clone()]),
127            })
128        }
129    }
130
131    /// Returns a reference to the statistics sets.
132    pub fn stats_sets(&self) -> &Arc<[StatsSet]> {
133        &self.stats
134    }
135
136    /// Returns a reference to the data types.
137    pub fn dtypes(&self) -> &Arc<[DType]> {
138        &self.dtypes
139    }
140
141    /// Returns the statistics and data type for a specific field.
142    ///
143    /// # Panics
144    ///
145    /// Panics if `field_idx` is out of bounds.
146    pub fn get(&self, field_idx: usize) -> (&StatsSet, &DType) {
147        (&self.stats[field_idx], &self.dtypes[field_idx])
148    }
149}
150
151impl<'a> IntoIterator for &'a FileStatistics {
152    type Item = (&'a StatsSet, &'a DType);
153    type IntoIter = std::iter::Zip<std::slice::Iter<'a, StatsSet>, std::slice::Iter<'a, DType>>;
154
155    fn into_iter(self) -> Self::IntoIter {
156        self.stats.iter().zip(self.dtypes.iter())
157    }
158}
159
// Marker impl: `FlatBufferRoot` is implemented here with no items of its own.
// NOTE(review): presumably this is what allows `FileStatistics` to be written as the root
// object of a flatbuffer message — confirm against `vortex_flatbuffers::FlatBufferRoot`.
impl FlatBufferRoot for FileStatistics {}
161
162impl WriteFlatBuffer for FileStatistics {
163    type Target<'a> = fb::FileStatistics<'a>;
164
165    fn write_flatbuffer<'fb>(
166        &self,
167        fbb: &mut FlatBufferBuilder<'fb>,
168    ) -> VortexResult<WIPOffset<Self::Target<'fb>>> {
169        let field_stats = self
170            .stats_sets()
171            .iter()
172            .map(|s| s.write_flatbuffer(fbb))
173            .collect::<VortexResult<Vec<_>>>()?;
174        let field_stats = fbb.create_vector(field_stats.as_slice());
175
176        Ok(fb::FileStatistics::create(
177            fbb,
178            &fb::FileStatisticsArgs {
179                field_stats: Some(field_stats),
180            },
181        ))
182    }
183}