Skip to main content

vortex_array/arrays/varbin/
builder.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::AsPrimitive;
5use vortex_buffer::BitBufferMut;
6use vortex_buffer::BufferMut;
7use vortex_dtype::DType;
8use vortex_dtype::IntegerPType;
9use vortex_error::vortex_panic;
10
11use crate::IntoArray;
12use crate::arrays::primitive::PrimitiveArray;
13use crate::arrays::varbin::VarBinArray;
14use crate::expr::stats::Precision;
15use crate::expr::stats::Stat;
16use crate::validity::Validity;
17
18pub struct VarBinBuilder<O: IntegerPType> {
19    offsets: BufferMut<O>,
20    data: BufferMut<u8>,
21    validity: BitBufferMut,
22}
23
24impl<O: IntegerPType> Default for VarBinBuilder<O> {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl<O: IntegerPType> VarBinBuilder<O> {
31    pub fn new() -> Self {
32        Self::with_capacity(0)
33    }
34
35    pub fn with_capacity(len: usize) -> Self {
36        let mut offsets = BufferMut::with_capacity(len + 1);
37        offsets.push(O::zero());
38        Self {
39            offsets,
40            data: BufferMut::empty(),
41            validity: BitBufferMut::with_capacity(len),
42        }
43    }
44
45    #[inline]
46    pub fn append(&mut self, value: Option<&[u8]>) {
47        match value {
48            Some(v) => self.append_value(v),
49            None => self.append_null(),
50        }
51    }
52
53    #[inline]
54    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
55        let slice = value.as_ref();
56        self.offsets
57            .push(O::from(self.data.len() + slice.len()).unwrap_or_else(|| {
58                vortex_panic!(
59                    "Failed to convert sum of {} and {} to offset of type {}",
60                    self.data.len(),
61                    slice.len(),
62                    std::any::type_name::<O>()
63                )
64            }));
65        self.data.extend_from_slice(slice);
66        self.validity.append_true();
67    }
68
69    #[inline]
70    pub fn append_null(&mut self) {
71        self.offsets.push(self.offsets[self.offsets.len() - 1]);
72        self.validity.append_false();
73    }
74
75    #[inline]
76    pub fn append_n_nulls(&mut self, n: usize) {
77        self.offsets.push_n(self.offsets[self.offsets.len() - 1], n);
78        self.validity.append_n(false, n);
79    }
80
81    #[inline]
82    pub fn append_values(&mut self, values: &[u8], end_offsets: impl Iterator<Item = O>, num: usize)
83    where
84        O: 'static,
85        usize: AsPrimitive<O>,
86    {
87        self.offsets
88            .extend(end_offsets.map(|offset| offset + self.data.len().as_()));
89        self.data.extend_from_slice(values);
90        self.validity.append_n(true, num);
91    }
92
93    pub fn finish(self, dtype: DType) -> VarBinArray {
94        let offsets = PrimitiveArray::new(self.offsets.freeze(), Validity::NonNullable);
95        let nulls = self.validity.freeze();
96
97        let validity = Validity::from_bit_buffer(nulls, dtype.nullability());
98
99        // The builder guarantees offsets are monotonically increasing, so we can set
100        // this stat eagerly. This avoids an O(n) recomputation when the array is
101        // deserialized and VarBinArray::validate checks sortedness.
102        debug_assert!(
103            offsets.statistics().compute_is_sorted().unwrap_or(false),
104            "VarBinBuilder offsets must be sorted"
105        );
106        offsets
107            .statistics()
108            .set(Stat::IsSorted, Precision::Exact(true.into()));
109
110        // SAFETY: The builder maintains all invariants:
111        // - Offsets are monotonically increasing starting from 0 (guaranteed by builder logic).
112        // - Bytes buffer contains exactly the data referenced by offsets.
113        // - Validity matches the dtype nullability.
114        // - UTF-8 validity is ensured by the caller when using DType::Utf8.
115        unsafe {
116            VarBinArray::new_unchecked(offsets.into_array(), self.data.freeze(), dtype, validity)
117        }
118    }
119}
120
#[cfg(test)]
mod tests {
    use vortex_dtype::DType;
    use vortex_dtype::Nullability::Nullable;
    use vortex_error::VortexResult;

    use crate::arrays::varbin::builder::VarBinBuilder;
    use crate::expr::stats::Precision;
    use crate::expr::stats::Stat;
    use crate::expr::stats::StatsProviderExt;
    use crate::scalar::Scalar;

    /// Values and nulls appended one at a time come back in order.
    #[test]
    fn test_builder() {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append(Some(b"hello"));
        builder.append(None);
        builder.append(Some(b"world"));
        let array = builder.finish(DType::Utf8(Nullable));

        assert_eq!(array.len(), 3);
        assert_eq!(array.dtype().nullability(), Nullable);
        assert_eq!(
            array.scalar_at(0).unwrap(),
            Scalar::utf8("hello".to_string(), Nullable)
        );
        assert!(array.scalar_at(1).unwrap().is_null());
        // The element after a null must start where the previous value ended.
        assert_eq!(
            array.scalar_at(2).unwrap(),
            Scalar::utf8("world".to_string(), Nullable)
        );
    }

    /// A run of nulls occupies one slot each and does not disturb later values.
    #[test]
    fn test_append_n_nulls() {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append_n_nulls(2);
        builder.append_value(b"x");
        let array = builder.finish(DType::Utf8(Nullable));

        assert_eq!(array.len(), 3);
        assert!(array.scalar_at(0).unwrap().is_null());
        assert!(array.scalar_at(1).unwrap().is_null());
        assert_eq!(
            array.scalar_at(2).unwrap(),
            Scalar::utf8("x".to_string(), Nullable)
        );
    }

    /// Bulk-appended offsets are relative to the new bytes and get shifted past
    /// the data that is already in the builder.
    #[test]
    fn test_append_values_rebases_offsets() {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append_value(b"ab");
        builder.append_values(b"cdefg", [2i32, 5].into_iter(), 2);
        let array = builder.finish(DType::Utf8(Nullable));

        assert_eq!(array.len(), 3);
        assert_eq!(
            array.scalar_at(0).unwrap(),
            Scalar::utf8("ab".to_string(), Nullable)
        );
        assert_eq!(
            array.scalar_at(1).unwrap(),
            Scalar::utf8("cd".to_string(), Nullable)
        );
        assert_eq!(
            array.scalar_at(2).unwrap(),
            Scalar::utf8("efg".to_string(), Nullable)
        );
    }

    /// `finish` records the offsets as exactly sorted.
    #[test]
    fn offsets_have_is_sorted_stat() -> VortexResult<()> {
        let mut builder = VarBinBuilder::<i32>::with_capacity(0);
        builder.append_value(b"aaa");
        builder.append_null();
        builder.append_value(b"bbb");
        let array = builder.finish(DType::Utf8(Nullable));

        let is_sorted = array
            .offsets()
            .statistics()
            .with_typed_stats_set(|s| s.get_as::<bool>(Stat::IsSorted));
        assert_eq!(is_sorted, Some(Precision::Exact(true)));
        Ok(())
    }

    /// The sorted stat is also set when nothing was appended.
    #[test]
    fn empty_builder_offsets_have_is_sorted_stat() -> VortexResult<()> {
        let builder = VarBinBuilder::<i32>::new();
        let array = builder.finish(DType::Utf8(Nullable));

        let is_sorted = array
            .offsets()
            .statistics()
            .with_typed_stats_set(|s| s.get_as::<bool>(Stat::IsSorted));
        assert_eq!(is_sorted, Some(Precision::Exact(true)));
        Ok(())
    }
}