vortex_layout/layouts/zoned/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod builder;
5mod reader;
6pub mod writer;
7pub mod zone_map;
8
9use std::sync::Arc;
10
11pub use builder::{MAX_IS_TRUNCATED, MIN_IS_TRUNCATED, lower_bound, upper_bound};
12use vortex_array::stats::{Stat, as_stat_bitset_bytes, stats_from_bitset_bytes};
13use vortex_array::{ArrayContext, DeserializeMetadata, SerializeMetadata};
14use vortex_dtype::{DType, TryFromBytes};
15use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_panic};
16
17use crate::children::{LayoutChildren, OwnedLayoutChildren};
18use crate::layouts::zoned::reader::ZonedReader;
19use crate::layouts::zoned::zone_map::ZoneMap;
20use crate::segments::{SegmentId, SegmentSource};
21use crate::{
22    LayoutChildType, LayoutEncodingRef, LayoutId, LayoutReaderRef, LayoutRef, VTable, vtable,
23};
24
25vtable!(Zoned);
26
27impl VTable for ZonedVTable {
28    type Layout = ZonedLayout;
29    type Encoding = ZonedLayoutEncoding;
30    type Metadata = ZonedMetadata;
31
32    fn id(_encoding: &Self::Encoding) -> LayoutId {
33        LayoutId::new_ref("vortex.stats") // For legacy reasons, this is called stats
34    }
35
36    fn encoding(_layout: &Self::Layout) -> LayoutEncodingRef {
37        LayoutEncodingRef::new_ref(ZonedLayoutEncoding.as_ref())
38    }
39
40    fn row_count(layout: &Self::Layout) -> u64 {
41        layout.children.child_row_count(0)
42    }
43
44    fn dtype(layout: &Self::Layout) -> &DType {
45        &layout.dtype
46    }
47
48    fn metadata(layout: &Self::Layout) -> Self::Metadata {
49        ZonedMetadata {
50            zone_len: u32::try_from(layout.zone_len).vortex_expect("Invalid zone length"),
51            present_stats: layout.present_stats.clone(),
52        }
53    }
54
55    fn segment_ids(_layout: &Self::Layout) -> Vec<SegmentId> {
56        vec![]
57    }
58
59    fn nchildren(_layout: &Self::Layout) -> usize {
60        2
61    }
62
63    fn child(layout: &Self::Layout, idx: usize) -> VortexResult<LayoutRef> {
64        match idx {
65            0 => layout.children.child(0, layout.dtype()),
66            1 => layout.children.child(
67                1,
68                &ZoneMap::dtype_for_stats_table(layout.dtype(), &layout.present_stats),
69            ),
70            _ => vortex_bail!("Invalid child index: {}", idx),
71        }
72    }
73
74    fn child_type(_layout: &Self::Layout, idx: usize) -> LayoutChildType {
75        match idx {
76            0 => LayoutChildType::Transparent("data".into()),
77            1 => LayoutChildType::Auxiliary("zones".into()),
78            _ => vortex_panic!("Invalid child index: {}", idx),
79        }
80    }
81
82    fn new_reader(
83        layout: &Self::Layout,
84        name: Arc<str>,
85        segment_source: Arc<dyn SegmentSource>,
86    ) -> VortexResult<LayoutReaderRef> {
87        Ok(Arc::new(ZonedReader::try_new(
88            layout.clone(),
89            name,
90            segment_source,
91        )?))
92    }
93
94    #[cfg(gpu_unstable)]
95    fn new_gpu_reader(
96        layout: &Self::Layout,
97        name: Arc<str>,
98        segment_source: Arc<dyn SegmentSource>,
99        ctx: Arc<cudarc::driver::CudaContext>,
100    ) -> VortexResult<crate::gpu::GpuLayoutReaderRef> {
101        // skip prunning and immediately return data child
102        layout
103            .children
104            .child(0, layout.dtype())?
105            .new_gpu_reader(name, segment_source, ctx)
106    }
107
108    fn build(
109        _encoding: &Self::Encoding,
110        dtype: &DType,
111        _row_count: u64,
112        metadata: &ZonedMetadata,
113        _segment_ids: Vec<SegmentId>,
114        children: &dyn LayoutChildren,
115        _ctx: ArrayContext,
116    ) -> VortexResult<Self::Layout> {
117        Ok(ZonedLayout {
118            dtype: dtype.clone(),
119            children: children.to_arc(),
120            zone_len: metadata.zone_len as usize,
121            present_stats: metadata.present_stats.clone(),
122        })
123    }
124}
125
/// Unit type used as the `Encoding` associated type of the zoned layout's `VTable`
/// implementation. It carries no data.
126#[derive(Debug)]
127pub struct ZonedLayoutEncoding;
128
129#[derive(Clone, Debug)]
130pub struct ZonedLayout {
131    dtype: DType,
132    children: Arc<dyn LayoutChildren>,
133    zone_len: usize,
134    present_stats: Arc<[Stat]>,
135}
136
137impl ZonedLayout {
138    pub fn new(
139        data: LayoutRef,
140        zones: LayoutRef,
141        zone_len: usize,
142        present_stats: Arc<[Stat]>,
143    ) -> Self {
144        if zone_len == 0 {
145            vortex_panic!("Zone length must be greater than 0");
146        }
147        let expected_dtype = ZoneMap::dtype_for_stats_table(data.dtype(), &present_stats);
148        if zones.dtype() != &expected_dtype {
149            vortex_panic!("Invalid zone map layout: zones dtype does not match expected dtype");
150        }
151        Self {
152            dtype: data.dtype().clone(),
153            children: OwnedLayoutChildren::layout_children(vec![data, zones]),
154            zone_len,
155            present_stats,
156        }
157    }
158
159    pub fn nzones(&self) -> usize {
160        usize::try_from(self.children.child_row_count(1)).vortex_expect("Invalid number of zones")
161    }
162
163    /// Returns an array of stats that exist in the layout's data, must be sorted.
164    pub fn present_stats(&self) -> &Arc<[Stat]> {
165        &self.present_stats
166    }
167}
168
/// Metadata stored alongside a zoned layout.
///
/// The byte form written by `serialize` is the zone length as 4 little-endian bytes followed
/// by the bitset bytes of the present statistics.
169#[derive(Debug, PartialEq, Eq, Clone)]
170pub struct ZonedMetadata {
    /// Zone length (the serializer's comment calls it the block size); presumably the number
    /// of rows covered by each zone — confirm against the writer.
171    pub(super) zone_len: u32,
    /// Statistics present in the zones table, encoded as a bitset when serialized.
172    pub(super) present_stats: Arc<[Stat]>,
173}
174
175impl DeserializeMetadata for ZonedMetadata {
176    type Output = Self;
177
178    fn deserialize(metadata: &[u8]) -> VortexResult<Self::Output> {
179        let zone_len = u32::try_from_le_bytes(&metadata[0..4])?;
180        let present_stats: Arc<[Stat]> = stats_from_bitset_bytes(&metadata[4..]).into();
181        Ok(Self {
182            zone_len,
183            present_stats,
184        })
185    }
186}
187
188impl SerializeMetadata for ZonedMetadata {
189    fn serialize(self) -> Vec<u8> {
190        let mut metadata = vec![];
191        // First, write the block size to the metadata.
192        metadata.extend_from_slice(&self.zone_len.to_le_bytes());
193        // Then write the bit-set of statistics.
194        metadata.extend_from_slice(&as_stat_bitset_bytes(&self.present_stats));
195        metadata
196    }
197}
198
#[cfg(test)]
mod tests {
    use rstest::rstest;

    use super::*;

    /// Serializing metadata and deserializing the bytes must yield an equal value.
    #[rstest]
    #[case(ZonedMetadata {
        zone_len: u32::MAX,
        present_stats: Arc::new([]),
    })]
    #[case(ZonedMetadata {
        zone_len: 0,
        present_stats: Arc::new([Stat::IsConstant]),
    })]
    #[case::all_sorted(ZonedMetadata {
        zone_len: 314,
        present_stats: Arc::new([
            Stat::IsConstant,
            Stat::IsSorted,
            Stat::IsStrictSorted,
            Stat::Max,
            Stat::Min,
            Stat::Sum,
            Stat::NullCount,
            Stat::UncompressedSizeInBytes,
            Stat::NaNCount,
        ]),
    })]
    #[case::some_sorted(ZonedMetadata {
        zone_len: 314,
        present_stats: Arc::new([
            Stat::IsSorted,
            Stat::IsStrictSorted,
            Stat::Max,
            Stat::Min,
            Stat::Sum,
            Stat::NullCount,
            Stat::UncompressedSizeInBytes,
            Stat::NaNCount,
        ]),
    })]
    fn test_metadata_serialization(#[case] metadata: ZonedMetadata) {
        let bytes = metadata.clone().serialize();
        let round_tripped = ZonedMetadata::deserialize(&bytes).unwrap();
        assert_eq!(round_tripped, metadata);
    }

    /// Stats supplied out of order come back sorted, so the round trip keeps the same number
    /// of stats but not the same sequence.
    #[test]
    fn test_deserialize_unsorted_stats() {
        let unsorted: Arc<[Stat]> = Arc::new([Stat::IsStrictSorted, Stat::IsSorted]);
        let metadata = ZonedMetadata {
            zone_len: u32::MAX,
            present_stats: unsorted,
        };

        let bytes = metadata.clone().serialize();
        let decoded = ZonedMetadata::deserialize(&bytes).unwrap();

        assert!(decoded.present_stats.is_sorted());
        assert_eq!(decoded.present_stats.len(), metadata.present_stats.len());
        assert_ne!(decoded.present_stats, metadata.present_stats);
    }
}