vortex_layout/layouts/flat/
writer.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
use vortex_array::parts::ArrayPartsFlatBuffer;
use vortex_array::stats::{ArrayStatistics, Stat, STATS_TO_WRITE};
use vortex_array::ArrayData;
use vortex_dtype::DType;
use vortex_error::{vortex_bail, vortex_err, VortexResult};
use vortex_flatbuffers::WriteFlatBufferExt;

use crate::layouts::flat::FlatLayout;
use crate::segments::SegmentWriter;
use crate::strategies::LayoutWriter;
use crate::LayoutData;

/// Options accepted by [`FlatLayoutWriter`].
pub struct FlatLayoutOptions {
    /// Stats to preserve when writing arrays.
    ///
    /// The writer passes this list to `retain_only` on the statistics of the pushed chunk and
    /// of each of its children before the chunk is serialized.
    pub array_stats: Vec<Stat>,
}

impl Default for FlatLayoutOptions {
    /// Builds options whose preserved stats are a copy of `STATS_TO_WRITE`.
    fn default() -> Self {
        let array_stats = STATS_TO_WRITE.to_vec();
        Self { array_stats }
    }
}

/// Writer for the flat layout.
///
/// A chunk is handed over with `push_chunk`, which builds the [`LayoutData`]; `finish` then
/// returns it.
pub struct FlatLayoutWriter {
    /// Options controlling which array statistics are kept when writing.
    options: FlatLayoutOptions,
    /// DType recorded on the produced layout.
    dtype: DType,
    /// Layout built by `push_chunk`; `None` until a chunk has been pushed, and `None` again once
    /// `finish` has taken it.
    layout: Option<LayoutData>,
}

impl FlatLayoutWriter {
    /// Creates a writer for arrays of the given `dtype`, configured by `options`.
    ///
    /// The writer starts out with no layout; one is produced by pushing a chunk.
    pub fn new(dtype: DType, options: FlatLayoutOptions) -> Self {
        Self {
            layout: None,
            dtype,
            options,
        }
    }
}

/// Calls `retain_only(stats)` on the statistics of `array`, then recurses into every child so
/// the whole array tree is visited.
fn retain_only_stats(array: &ArrayData, stats: &[Stat]) {
    array.statistics().retain_only(stats);
    array
        .children()
        .into_iter()
        .for_each(|child| retain_only_stats(&child, stats));
}

impl LayoutWriter for FlatLayoutWriter {
    /// Writes `chunk` as the one chunk held by this flat layout.
    ///
    /// The chunk's statistics (and those of its children) are first trimmed with
    /// `retain_only_stats` using `options.array_stats`. Every byte buffer found during a
    /// depth-first traversal of the chunk is then put into its own segment, followed by one
    /// more segment containing the flatbuffer that describes the array. The resulting
    /// [`LayoutData`] is stored until `finish` is called.
    ///
    /// # Errors
    ///
    /// Returns an error if a chunk has already been pushed and `finish` has not yet taken the
    /// layout. Nothing is written to `segments` in that case.
    fn push_chunk(
        &mut self,
        segments: &mut dyn SegmentWriter,
        chunk: ArrayData,
    ) -> VortexResult<()> {
        // `layout` is only `Some` between a successful push and the following `finish`, so
        // hitting this means a second chunk was pushed into a single-chunk layout.
        if self.layout.is_some() {
            vortex_bail!("FlatLayoutWriter::push_chunk called more than once before finish");
        }
        let row_count = chunk.len() as u64;
        retain_only_stats(&chunk, &self.options.array_stats);

        // We store each Array buffer in its own segment.
        let mut segment_ids = vec![];
        for child in chunk.depth_first_traversal() {
            for buffer in child.byte_buffers() {
                // TODO(ngates): decide a way of splitting buffers if they exceed u32 size.
                //  We could write empty segments either side of buffers to concatenate?
                //  Or we could use Layout::metadata to store this information.
                segment_ids.push(segments.put(buffer));
            }
        }

        // ...followed by a FlatBuffer describing the array layout.
        let flatbuffer = ArrayPartsFlatBuffer::new(&chunk).write_flatbuffer_bytes();
        segment_ids.push(segments.put(flatbuffer.into_inner()));

        self.layout = Some(LayoutData::new_owned(
            &FlatLayout,
            self.dtype.clone(),
            row_count,
            Some(segment_ids),
            None,
            None,
        ));
        Ok(())
    }

    /// Returns the layout built by `push_chunk`, leaving the writer without a layout.
    ///
    /// # Errors
    ///
    /// Returns an error if no chunk has been pushed since the writer was created or since the
    /// previous call to `finish`.
    fn finish(&mut self, _segments: &mut dyn SegmentWriter) -> VortexResult<LayoutData> {
        self.layout
            .take()
            .ok_or_else(|| vortex_err!("FlatLayoutWriter::finish called without push_chunk"))
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use futures::executor::block_on;
    use vortex_array::array::PrimitiveArray;
    use vortex_array::stats::{ArrayStatistics, Stat};
    use vortex_array::validity::Validity;
    use vortex_array::{ArrayDType, ToArrayData};
    use vortex_buffer::buffer;
    use vortex_expr::ident;
    use vortex_scan::RowMask;

    use crate::layouts::flat::writer::FlatLayoutWriter;
    use crate::segments::test::TestSegments;
    use crate::strategies::LayoutWriterExt;

    /// Writes an array with default options and checks that the bit-width and trailing-zero
    /// frequency stats are absent from the array read back.
    #[test]
    fn flat_stats() {
        block_on(async {
            let values = PrimitiveArray::new(buffer![1, 2, 3, 4, 5], Validity::AllValid);

            // Make sure both stats are computable on the input before it is written.
            assert!(values.statistics().compute_bit_width_freq().is_some());
            assert!(values.statistics().compute_trailing_zero_freq().is_some());

            // Write the array through the flat layout writer into in-memory test segments.
            let mut store = TestSegments::default();
            let layout = FlatLayoutWriter::new(values.dtype().clone(), Default::default())
                .push_one(&mut store, values.to_array())
                .unwrap();

            // Read every row back with the identity expression.
            let reader = layout
                .reader(Arc::new(store), Default::default())
                .unwrap();
            let all_rows = RowMask::new_valid_between(0, layout.row_count());
            let read_back = reader.evaluate_expr(all_rows, ident()).await.unwrap();

            assert!(read_back.statistics().get(Stat::BitWidthFreq).is_none());
            assert!(read_back.statistics().get(Stat::TrailingZeroFreq).is_none());
        })
    }
}