Skip to main content

vortex_array/arrays/varbin/
builder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::BitBufferMut;
6use vortex_buffer::BufferMut;
7use vortex_error::vortex_panic;
8
9use crate::IntoArray;
10#[cfg(debug_assertions)]
11use crate::LEGACY_SESSION;
12#[cfg(debug_assertions)]
13use crate::VortexSessionExecute;
14use crate::arrays::PrimitiveArray;
15use crate::arrays::VarBinArray;
16use crate::dtype::DType;
17use crate::dtype::IntegerPType;
18use crate::expr::stats::Precision;
19use crate::expr::stats::Stat;
20use crate::validity::Validity;
21
22pub struct VarBinBuilder<O: IntegerPType> {
23    offsets: BufferMut<O>,
24    data: BufferMut<u8>,
25    validity: BitBufferMut,
26}
27
28impl<O: IntegerPType> Default for VarBinBuilder<O> {
29    fn default() -> Self {
30        Self::new()
31    }
32}
33
34impl<O: IntegerPType> VarBinBuilder<O> {
35    pub fn new() -> Self {
36        Self::with_capacity(0)
37    }
38
39    pub fn with_capacity(len: usize) -> Self {
40        let mut offsets = BufferMut::with_capacity(len + 1);
41        offsets.push(O::zero());
42        Self {
43            offsets,
44            data: BufferMut::empty(),
45            validity: BitBufferMut::with_capacity(len),
46        }
47    }
48
49    #[inline]
50    pub fn append(&mut self, value: Option<&[u8]>) {
51        match value {
52            Some(v) => self.append_value(v),
53            None => self.append_null(),
54        }
55    }
56
57    #[inline]
58    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
59        let slice = value.as_ref();
60        self.offsets
61            .push(O::from(self.data.len() + slice.len()).unwrap_or_else(|| {
62                vortex_panic!(
63                    "Failed to convert sum of {} and {} to offset of type {}",
64                    self.data.len(),
65                    slice.len(),
66                    std::any::type_name::<O>()
67                )
68            }));
69        self.data.extend_from_slice(slice);
70        self.validity.append_true();
71    }
72
73    #[inline]
74    pub fn append_null(&mut self) {
75        self.offsets.push(self.offsets[self.offsets.len() - 1]);
76        self.validity.append_false();
77    }
78
79    #[inline]
80    pub fn append_n_nulls(&mut self, n: usize) {
81        self.offsets.push_n(self.offsets[self.offsets.len() - 1], n);
82        self.validity.append_n(false, n);
83    }
84
85    #[inline]
86    pub fn append_values(&mut self, values: &[u8], end_offsets: impl Iterator<Item = O>, num: usize)
87    where
88        O: 'static,
89        usize: AsPrimitive<O>,
90    {
91        self.offsets
92            .extend(end_offsets.map(|offset| offset + self.data.len().as_()));
93        self.data.extend_from_slice(values);
94        self.validity.append_n(true, num);
95    }
96
97    pub fn finish(self, dtype: DType) -> VarBinArray {
98        let offsets = PrimitiveArray::new(self.offsets.freeze(), Validity::NonNullable);
99        let nulls = self.validity.freeze();
100
101        let validity = Validity::from_bit_buffer(nulls, dtype.nullability());
102
103        // The builder guarantees offsets are monotonically increasing, so we can set
104        // this stat eagerly. This avoids an O(n) recomputation when the array is
105        // deserialized and VarBinArray::validate checks sortedness.
106        #[cfg(debug_assertions)]
107        {
108            let offsets_are_sorted = offsets
109                .statistics()
110                .compute_is_sorted(&mut LEGACY_SESSION.create_execution_ctx())
111                .unwrap_or(false);
112            debug_assert!(offsets_are_sorted, "VarBinBuilder offsets must be sorted");
113        }
114        offsets
115            .statistics()
116            .set(Stat::IsSorted, Precision::Exact(true.into()));
117
118        // SAFETY: The builder maintains all invariants:
119        // - Offsets are monotonically increasing starting from 0 (guaranteed by builder logic).
120        // - Bytes buffer contains exactly the data referenced by offsets.
121        // - Validity matches the dtype nullability.
122        // - UTF-8 validity is ensured by the caller when using DType::Utf8.
123        unsafe {
124            VarBinArray::new_unchecked(offsets.into_array(), self.data.freeze(), dtype, validity)
125        }
126    }
127}
128
#[cfg(test)]
mod tests {
    use vortex_error::VortexResult;

    use crate::LEGACY_SESSION;
    use crate::VortexSessionExecute;
    use crate::arrays::varbin::VarBinArrayExt;
    use crate::arrays::varbin::builder::VarBinBuilder;
    use crate::dtype::DType;
    use crate::dtype::Nullability::Nullable;
    use crate::expr::stats::Precision;
    use crate::expr::stats::Stat;
    use crate::expr::stats::StatsProviderExt;
    use crate::scalar::Scalar;

    /// Values and nulls appended through `append` round-trip through `finish`.
    #[test]
    fn test_builder() {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append(Some(b"hello"));
        builder.append(None);
        builder.append(Some(b"world"));
        let array = builder.finish(DType::Utf8(Nullable));

        assert_eq!(array.len(), 3);
        assert_eq!(array.dtype().nullability(), Nullable);
        assert_eq!(
            array
                .execute_scalar(0, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap(),
            Scalar::utf8("hello".to_string(), Nullable)
        );
        assert!(
            array
                .execute_scalar(1, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap()
                .is_null()
        );
        // The value following a null must still resolve to its own bytes.
        assert_eq!(
            array
                .execute_scalar(2, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap(),
            Scalar::utf8("world".to_string(), Nullable)
        );
    }

    /// Bulk-appended values are rebased onto existing data, and bulk nulls
    /// follow them.
    #[test]
    fn test_bulk_appends() {
        let mut builder = VarBinBuilder::<i32>::new();
        builder.append_value(b"ab");
        // Two values, "cd" and "efg", with end offsets relative to the slice.
        builder.append_values(b"cdefg", [2i32, 5].into_iter(), 2);
        builder.append_n_nulls(2);
        let array = builder.finish(DType::Utf8(Nullable));

        assert_eq!(array.len(), 5);
        assert_eq!(
            array
                .execute_scalar(1, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap(),
            Scalar::utf8("cd".to_string(), Nullable)
        );
        assert_eq!(
            array
                .execute_scalar(2, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap(),
            Scalar::utf8("efg".to_string(), Nullable)
        );
        assert!(
            array
                .execute_scalar(3, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap()
                .is_null()
        );
        assert!(
            array
                .execute_scalar(4, &mut LEGACY_SESSION.create_execution_ctx())
                .unwrap()
                .is_null()
        );
    }

    /// `finish` records an exact `IsSorted` stat on the offsets child.
    #[test]
    fn offsets_have_is_sorted_stat() -> VortexResult<()> {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append_value(b"aaa");
        builder.append_null();
        builder.append_value(b"bbb");
        let array = builder.finish(DType::Utf8(Nullable));

        let is_sorted = array
            .offsets()
            .statistics()
            .with_typed_stats_set(|s| s.get_as::<bool>(Stat::IsSorted));
        assert_eq!(is_sorted, Some(Precision::Exact(true)));
        Ok(())
    }

    /// The `IsSorted` stat is also recorded when nothing was appended.
    #[test]
    fn empty_builder_offsets_have_is_sorted_stat() -> VortexResult<()> {
        let builder = VarBinBuilder::<i32>::new();
        let array = builder.finish(DType::Utf8(Nullable));

        let is_sorted = array
            .offsets()
            .statistics()
            .with_typed_stats_set(|s| s.get_as::<bool>(Stat::IsSorted));
        assert_eq!(is_sorted, Some(Precision::Exact(true)));
        Ok(())
    }
}