Skip to main content

vortex_file/footer/
file_statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! This module defines the file statistics component of the Vortex file footer.
5//!
6//! File statistics provide metadata about the data in the file, such as min/max values,
7//! null counts, and other statistical information that can be used for query optimization
8//! and data exploration.
9use std::sync::Arc;
10
11use flatbuffers::FlatBufferBuilder;
12use flatbuffers::WIPOffset;
13use itertools::Itertools;
14use vortex_array::stats::StatsSet;
15use vortex_dtype::DType;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_ensure_eq;
19use vortex_flatbuffers::FlatBufferRoot;
20use vortex_flatbuffers::WriteFlatBuffer;
21use vortex_flatbuffers::array::ArrayStats;
22use vortex_flatbuffers::footer as fb;
23
24/// Contains statistical information about the data in a Vortex file.
25///
26/// This struct wraps an array of `StatsSet` objects, each containing statistics
27/// for a field or column in the file. These statistics can be used for query
28/// optimization and data exploration.
29#[derive(Clone, Debug)]
30pub struct FileStatistics {
31    /// An array of statistics sets, one for each field or column in the file.
32    stats: Arc<[StatsSet]>,
33    /// An array of `DType`s, one for each field or column in the file.
34    dtypes: Arc<[DType]>,
35}
36
37impl FileStatistics {
38    /// Creates a new [`FileStatistics`] from the given statistics and data types.
39    ///
40    /// # Panics
41    ///
42    /// Panics if `stats` and `dtypes` have different lengths.
43    pub fn new(stats: Arc<[StatsSet]>, dtypes: Arc<[DType]>) -> Self {
44        assert_eq!(
45            stats.len(),
46            dtypes.len(),
47            "stats and dtypes must have the same length"
48        );
49
50        Self { stats, dtypes }
51    }
52
53    /// Creates a new [`FileStatistics`] from the given statistics and file dtype.
54    ///
55    /// If the [`DType`] of the file is a [`DType::Struct`], then there must be the same number of
56    /// stats as struct fields. Otherwise, there must be only 1 statistic.
57    ///
58    /// # Panics
59    ///
60    /// Panics if the number of stats doesn't match the expected number based on the dtype.
61    pub fn new_with_dtype(stats: Arc<[StatsSet]>, file_dtype: &DType) -> Self {
62        if let DType::Struct(struct_fields, _) = file_dtype {
63            assert_eq!(
64                stats.len(),
65                struct_fields.nfields(),
66                "stats length must match number of struct fields"
67            );
68
69            let dtypes = struct_fields.fields().collect();
70
71            Self { stats, dtypes }
72        } else {
73            assert_eq!(
74                stats.len(),
75                1,
76                "non-struct dtype must have exactly 1 statistic"
77            );
78
79            Self {
80                stats,
81                dtypes: Arc::new([file_dtype.clone()]),
82            }
83        }
84    }
85
86    /// Creates [`FileStatistics`] from a flatbuffers [`fb::FileStatistics<'a>`].
87    ///
88    /// If the [`DType`] of the file is a [`DType::Struct`], then there must be the same number of
89    /// file stats in the flatbuffer. Otherwise, there must be only 1 statistic.
90    pub fn from_flatbuffer<'a>(
91        fb: &fb::FileStatistics<'a>,
92        file_dtype: &DType,
93    ) -> VortexResult<Self> {
94        let field_stats = fb.field_stats().unwrap_or_default();
95        let mut array_stats: Vec<ArrayStats> = field_stats.iter().collect();
96
97        if let DType::Struct(struct_fields, _) = file_dtype {
98            vortex_ensure_eq!(array_stats.len(), struct_fields.nfields());
99
100            let stats_sets: Arc<[StatsSet]> = array_stats
101                .into_iter()
102                .zip(struct_fields.fields())
103                .map(|(array_stat, field_dtype)| {
104                    StatsSet::from_flatbuffer(&array_stat, &field_dtype)
105                })
106                .try_collect()?;
107
108            let dtypes = struct_fields.fields().collect();
109
110            Ok(Self {
111                stats: stats_sets,
112                dtypes,
113            })
114        } else {
115            vortex_ensure_eq!(array_stats.len(), 1);
116
117            let array_stat = array_stats
118                .pop()
119                .vortex_expect("we just checked that there was 1 field");
120            let stats_set = StatsSet::from_flatbuffer(&array_stat, file_dtype)?;
121
122            Ok(Self {
123                stats: Arc::new([stats_set]),
124                dtypes: Arc::new([file_dtype.clone()]),
125            })
126        }
127    }
128
129    /// Returns a reference to the statistics sets.
130    pub fn stats_sets(&self) -> &Arc<[StatsSet]> {
131        &self.stats
132    }
133
134    /// Returns a reference to the data types.
135    pub fn dtypes(&self) -> &Arc<[DType]> {
136        &self.dtypes
137    }
138
139    /// Returns the statistics and data type for a specific field.
140    ///
141    /// # Panics
142    ///
143    /// Panics if `field_idx` is out of bounds.
144    pub fn get(&self, field_idx: usize) -> (&StatsSet, &DType) {
145        (&self.stats[field_idx], &self.dtypes[field_idx])
146    }
147}
148
// Empty trait impl: no associated items are defined for `FileStatistics` here.
// NOTE(review): presumably this marks `FileStatistics` as usable as the root table of a
// flatbuffer message — confirm against the `FlatBufferRoot` docs in `vortex_flatbuffers`.
impl FlatBufferRoot for FileStatistics {}
150
151impl WriteFlatBuffer for FileStatistics {
152    type Target<'a> = fb::FileStatistics<'a>;
153
154    fn write_flatbuffer<'fb>(
155        &self,
156        fbb: &mut FlatBufferBuilder<'fb>,
157    ) -> VortexResult<WIPOffset<Self::Target<'fb>>> {
158        let field_stats = self
159            .stats_sets()
160            .iter()
161            .map(|s| s.write_flatbuffer(fbb))
162            .collect::<VortexResult<Vec<_>>>()?;
163        let field_stats = fbb.create_vector(field_stats.as_slice());
164
165        Ok(fb::FileStatistics::create(
166            fbb,
167            &fb::FileStatisticsArgs {
168                field_stats: Some(field_stats),
169            },
170        ))
171    }
172}