Skip to main content

vortex_array/arrays/varbinview/
build_views.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use itertools::Itertools;
5use num_traits::AsPrimitive;
6use vortex_buffer::Buffer;
7use vortex_buffer::BufferMut;
8use vortex_buffer::ByteBuffer;
9use vortex_buffer::ByteBufferMut;
10use vortex_dtype::NativePType;
11
12pub use crate::arrays::BinaryView;
13
14/// Convert an offsets buffer to a buffer of element lengths.
15#[inline]
16pub fn offsets_to_lengths<P: NativePType>(offsets: &[P]) -> Buffer<P> {
17    offsets
18        .iter()
19        .tuple_windows::<(_, _)>()
20        .map(|(&start, &end)| end - start)
21        .collect()
22}
23
/// Maximum number of buffer bytes that can be referenced by a single `BinaryView`.
///
/// Fixed at `i32::MAX` (2^31 - 1 bytes, i.e. just under 2 GiB).
pub const MAX_BUFFER_LEN: usize = i32::MAX as usize;
26
27/// Split a large buffer of input `bytes` holding string data
28pub fn build_views<P: NativePType + AsPrimitive<usize>>(
29    start_buf_index: u32,
30    max_buffer_len: usize,
31    mut bytes: ByteBufferMut,
32    lens: &[P],
33) -> (Vec<ByteBuffer>, Buffer<BinaryView>) {
34    let mut views = BufferMut::<BinaryView>::with_capacity(lens.len());
35
36    let mut buffers = Vec::new();
37    let mut buf_index = start_buf_index;
38
39    let mut offset = 0;
40    for &len in lens {
41        let len = len.as_();
42        assert!(len <= max_buffer_len, "values cannot exceed max_buffer_len");
43
44        if (offset + len) > max_buffer_len {
45            // Roll the buffer every 2GiB, to avoid overflowing VarBinView offset field
46            let rest = bytes.split_off(offset);
47
48            buffers.push(bytes.freeze());
49            buf_index += 1;
50            offset = 0;
51
52            bytes = rest;
53        }
54        let view = BinaryView::make_view(&bytes[offset..][..len], buf_index, offset.as_());
55        // SAFETY: we reserved the right capacity beforehand
56        unsafe { views.push_unchecked(view) };
57        offset += len;
58    }
59
60    if !bytes.is_empty() {
61        buffers.push(bytes.freeze());
62    }
63
64    (buffers, views.freeze())
65}
66
#[cfg(test)]
mod tests {
    use vortex_buffer::ByteBuffer;
    use vortex_buffer::ByteBufferMut;

    use crate::arrays::BinaryView;
    use crate::arrays::build_views::build_views;

    /// Exercises the buffer-splitting path of `build_views`.
    ///
    /// The raw data holds four 13-byte values:
    ///
    /// ```text
    /// aaaaaaaaaaaaa ("a"*13)
    /// bbbbbbbbbbbbb ("b"*13)
    /// ccccccccccccc ("c"*13)
    /// ddddddddddddd ("d"*13)
    /// ```
    ///
    /// In real code this would all fit in one buffer, but to unit test the splitting
    /// logic the maximum buffer length is set to 26, which should result in two
    /// buffers of two values each.
    #[test]
    fn test_to_canonical_large() {
        let input =
            ByteBufferMut::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbbcccccccccccccddddddddddddd");
        let value_lens = vec![13u8; 4];

        let (buffers, views) = build_views(0, 26, input, &value_lens);

        // The data must be cut exactly between the second and third value.
        let expected_buffers = vec![
            ByteBuffer::copy_from("aaaaaaaaaaaaabbbbbbbbbbbbb"),
            ByteBuffer::copy_from("cccccccccccccddddddddddddd"),
        ];
        assert_eq!(buffers, expected_buffers);

        // Offsets restart at zero in the second buffer.
        let expected_views = [
            BinaryView::make_view(b"aaaaaaaaaaaaa", 0, 0),
            BinaryView::make_view(b"bbbbbbbbbbbbb", 0, 13),
            BinaryView::make_view(b"ccccccccccccc", 1, 0),
            BinaryView::make_view(b"ddddddddddddd", 1, 13),
        ];
        assert_eq!(views.as_slice(), &expected_views);
    }
}