Skip to main content

vortex_layout/layouts/zoned/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod builder;
5mod reader;
6pub mod writer;
7pub mod zone_map;
8
9use std::sync::Arc;
10
11pub use builder::MAX_IS_TRUNCATED;
12pub use builder::MIN_IS_TRUNCATED;
13use vortex_array::DeserializeMetadata;
14use vortex_array::SerializeMetadata;
15use vortex_array::dtype::DType;
16use vortex_array::dtype::TryFromBytes;
17use vortex_array::expr::stats::Stat;
18use vortex_array::stats::as_stat_bitset_bytes;
19use vortex_array::stats::stats_from_bitset_bytes;
20use vortex_error::VortexExpect;
21use vortex_error::VortexResult;
22use vortex_error::vortex_bail;
23use vortex_error::vortex_panic;
24use vortex_session::VortexSession;
25use vortex_session::registry::ReadContext;
26
27use crate::LayoutChildType;
28use crate::LayoutEncodingRef;
29use crate::LayoutId;
30use crate::LayoutReaderRef;
31use crate::LayoutRef;
32use crate::VTable;
33use crate::children::LayoutChildren;
34use crate::children::OwnedLayoutChildren;
35use crate::layouts::zoned::reader::ZonedReader;
36use crate::layouts::zoned::zone_map::ZoneMap;
37use crate::segments::SegmentId;
38use crate::segments::SegmentSource;
39use crate::vtable;
40
41vtable!(Zoned);
42
43impl VTable for Zoned {
44    type Layout = ZonedLayout;
45    type Encoding = ZonedLayoutEncoding;
46    type Metadata = ZonedMetadata;
47
48    fn id(_encoding: &Self::Encoding) -> LayoutId {
49        LayoutId::new_ref("vortex.stats") // For legacy reasons, this is called stats
50    }
51
52    fn encoding(_layout: &Self::Layout) -> LayoutEncodingRef {
53        LayoutEncodingRef::new_ref(ZonedLayoutEncoding.as_ref())
54    }
55
56    fn row_count(layout: &Self::Layout) -> u64 {
57        layout.children.child_row_count(0)
58    }
59
60    fn dtype(layout: &Self::Layout) -> &DType {
61        &layout.dtype
62    }
63
64    fn metadata(layout: &Self::Layout) -> Self::Metadata {
65        ZonedMetadata {
66            zone_len: u32::try_from(layout.zone_len).vortex_expect("Invalid zone length"),
67            present_stats: layout.present_stats.clone(),
68        }
69    }
70
71    fn segment_ids(_layout: &Self::Layout) -> Vec<SegmentId> {
72        vec![]
73    }
74
75    fn nchildren(_layout: &Self::Layout) -> usize {
76        2
77    }
78
79    fn child(layout: &Self::Layout, idx: usize) -> VortexResult<LayoutRef> {
80        match idx {
81            0 => layout.children.child(0, layout.dtype()),
82            1 => layout.children.child(
83                1,
84                &ZoneMap::dtype_for_stats_table(layout.dtype(), &layout.present_stats),
85            ),
86            _ => vortex_bail!("Invalid child index: {}", idx),
87        }
88    }
89
90    fn child_type(_layout: &Self::Layout, idx: usize) -> LayoutChildType {
91        match idx {
92            0 => LayoutChildType::Transparent("data".into()),
93            1 => LayoutChildType::Auxiliary("zones".into()),
94            _ => vortex_panic!("Invalid child index: {}", idx),
95        }
96    }
97
98    fn new_reader(
99        layout: &Self::Layout,
100        name: Arc<str>,
101        segment_source: Arc<dyn SegmentSource>,
102        session: &VortexSession,
103    ) -> VortexResult<LayoutReaderRef> {
104        Ok(Arc::new(ZonedReader::try_new(
105            layout.clone(),
106            name,
107            segment_source,
108            session.clone(),
109        )?))
110    }
111
112    fn build(
113        _encoding: &Self::Encoding,
114        dtype: &DType,
115        _row_count: u64,
116        metadata: &ZonedMetadata,
117        _segment_ids: Vec<SegmentId>,
118        children: &dyn LayoutChildren,
119        _ctx: &ReadContext,
120    ) -> VortexResult<Self::Layout> {
121        Ok(ZonedLayout {
122            dtype: dtype.clone(),
123            children: children.to_arc(),
124            zone_len: metadata.zone_len as usize,
125            present_stats: metadata.present_stats.clone(),
126        })
127    }
128
129    fn with_children(layout: &mut Self::Layout, children: Vec<LayoutRef>) -> VortexResult<()> {
130        if children.len() != 2 {
131            vortex_bail!(
132                "ZonedLayout expects exactly 2 children (data, zones), got {}",
133                children.len()
134            );
135        }
136        layout.children = OwnedLayoutChildren::layout_children(children);
137        Ok(())
138    }
139}
140
141#[derive(Debug)]
142pub struct ZonedLayoutEncoding;
143
144/// Annotates a data layout with per-zone aggregate statistics (e.g. min, max, null count).
145///
146/// During reads, zone maps allow entire zones to be skipped when a filter predicate cannot match.
147#[derive(Clone, Debug)]
148pub struct ZonedLayout {
149    dtype: DType,
150    children: Arc<dyn LayoutChildren>,
151    zone_len: usize,
152    present_stats: Arc<[Stat]>,
153}
154
155impl ZonedLayout {
156    pub fn new(
157        data: LayoutRef,
158        zones: LayoutRef,
159        zone_len: usize,
160        present_stats: Arc<[Stat]>,
161    ) -> Self {
162        if zone_len == 0 {
163            vortex_panic!("Zone length must be greater than 0");
164        }
165        let expected_dtype = ZoneMap::dtype_for_stats_table(data.dtype(), &present_stats);
166        if zones.dtype() != &expected_dtype {
167            vortex_panic!("Invalid zone map layout: zones dtype does not match expected dtype");
168        }
169        Self {
170            dtype: data.dtype().clone(),
171            children: OwnedLayoutChildren::layout_children(vec![data, zones]),
172            zone_len,
173            present_stats,
174        }
175    }
176
177    pub fn nzones(&self) -> usize {
178        usize::try_from(self.children.child_row_count(1)).vortex_expect("Invalid number of zones")
179    }
180
181    /// Returns an array of stats that exist in the layout's data, must be sorted.
182    pub fn present_stats(&self) -> &Arc<[Stat]> {
183        &self.present_stats
184    }
185}
186
187#[derive(Debug, PartialEq, Eq, Clone)]
188pub struct ZonedMetadata {
189    pub(super) zone_len: u32,
190    pub(super) present_stats: Arc<[Stat]>,
191}
192
193impl DeserializeMetadata for ZonedMetadata {
194    type Output = Self;
195
196    fn deserialize(metadata: &[u8]) -> VortexResult<Self::Output> {
197        let zone_len = u32::try_from_le_bytes(&metadata[0..4])?;
198        let present_stats: Arc<[Stat]> = stats_from_bitset_bytes(&metadata[4..]).into();
199        Ok(Self {
200            zone_len,
201            present_stats,
202        })
203    }
204}
205
206impl SerializeMetadata for ZonedMetadata {
207    fn serialize(self) -> Vec<u8> {
208        let mut metadata = vec![];
209        // First, write the block size to the metadata.
210        metadata.extend_from_slice(&self.zone_len.to_le_bytes());
211        // Then write the bit-set of statistics.
212        metadata.extend_from_slice(&as_stat_bitset_bytes(&self.present_stats));
213        metadata
214    }
215}
216
#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;

    /// Serializing and then deserializing metadata must yield an equal value.
    #[rstest]
    #[case(ZonedMetadata {
            zone_len: u32::MAX,
            present_stats: Arc::new([]),
        })]
    #[case(ZonedMetadata {
            zone_len: 0,
            present_stats: Arc::new([Stat::IsConstant]),
        })]
    #[case::all_sorted(ZonedMetadata {
            zone_len: 314,
            present_stats: Arc::new([Stat::IsConstant, Stat::IsSorted, Stat::IsStrictSorted, Stat::Max, Stat::Min, Stat::Sum, Stat::NullCount, Stat::UncompressedSizeInBytes, Stat::NaNCount]),
        })]
    #[case::some_sorted(ZonedMetadata {
            zone_len: 314,
            present_stats: Arc::new([Stat::IsSorted, Stat::IsStrictSorted, Stat::Max, Stat::Min, Stat::Sum, Stat::NullCount, Stat::UncompressedSizeInBytes, Stat::NaNCount]),
        })]
    fn test_metadata_serialization(#[case] metadata: ZonedMetadata) {
        let bytes = metadata.clone().serialize();
        let round_tripped = ZonedMetadata::deserialize(&bytes).unwrap();
        assert_eq!(round_tripped, metadata);
    }

    /// Stats given out of order come back sorted, so the round trip is not an identity here.
    #[test]
    fn test_deserialize_unsorted_stats() {
        let unsorted: Arc<[Stat]> = Arc::new([Stat::IsStrictSorted, Stat::IsSorted]);
        let metadata = ZonedMetadata {
            zone_len: u32::MAX,
            present_stats: unsorted,
        };

        let bytes = metadata.clone().serialize();
        let decoded = ZonedMetadata::deserialize(&bytes).unwrap();

        // Same stats, but in sorted rather than the original order.
        assert!(decoded.present_stats.is_sorted());
        assert_eq!(
            decoded.present_stats.len(),
            metadata.present_stats.len()
        );
        assert_ne!(decoded.present_stats, metadata.present_stats);
    }
}