vortex_array/arrays/varbinview/
compact.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5//! be dropped.
6
7use vortex_error::VortexResult;
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14impl VarBinViewArray {
15    /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
16    /// operation can be very expensive, in the worst cast copying all existing string data into
17    /// a new allocation.
18    ///
19    /// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
20    /// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
21    /// so, will aggressively compact all visile outlined string data into a single new buffer.
22    pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
23        // If there is nothing to be gained by compaction, return the original array untouched.
24        if !self.should_compact() {
25            return Ok(self.clone());
26        }
27
28        // Compaction pathways, depend on the validity
29        match self.validity() {
30            // The array contains no values, all buffers can be dropped.
31            // SAFETY: for all-invalid array, zeroed views and buffer because they are never accessed.
32            Validity::AllInvalid => unsafe {
33                Ok(VarBinViewArray::new_unchecked(
34                    self.views().clone(),
35                    Default::default(),
36                    self.dtype().clone(),
37                    self.validity().clone(),
38                ))
39            },
40            // Non-null pathway
41            Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
42            // Nullable pathway, requires null-checks for each value
43            Validity::Array(_) => rebuild_nullable(self),
44        }
45    }
46
47    fn should_compact(&self) -> bool {
48        // If the array is entirely inlined strings, do not attempt to compact.
49        if self.nbuffers() == 0 {
50            return false;
51        }
52
53        let bytes_referenced: u64 = self.count_referenced_bytes();
54        let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
55
56        // If there is any wasted space, we want to repack.
57        // This is very aggressive.
58        bytes_referenced < buffer_total_bytes
59    }
60
61    // count the number of bytes addressed by the views, not including null
62    // values or any inlined strings.
63    fn count_referenced_bytes(&self) -> u64 {
64        match self.validity() {
65            Validity::AllInvalid => 0u64,
66            Validity::NonNullable | Validity::AllValid | Validity::Array(_) => self
67                .views()
68                .iter()
69                .enumerate()
70                .map(|(idx, &view)| {
71                    if !self.is_valid(idx) || view.is_inlined() {
72                        0u64
73                    } else {
74                        view.len() as u64
75                    }
76                })
77                .sum(),
78        }
79    }
80}
81
82// Nullable string array compaction pathway.
83// This requires a null check on every append.
84fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
85    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
86    for i in 0..array.len() {
87        if !array.is_valid(i) {
88            builder.append_null();
89        } else {
90            let bytes = array.bytes_at(i);
91            builder.append_value(bytes.as_slice());
92        }
93    }
94
95    Ok(builder.finish_into_varbinview())
96}
97
98// Compaction for string arrays that contain no null values. Saves a branch
99// for every string element.
100fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
101    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
102    for i in 0..array.len() {
103        builder.append_value(array.bytes_at(i).as_ref());
104    }
105    Ok(builder.finish_into_varbinview())
106}
107
108#[cfg(test)]
109mod tests {
110    use vortex_buffer::buffer;
111
112    use crate::IntoArray;
113    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
114    use crate::compute::take;
115
116    #[test]
117    fn test_optimize_compacts_buffers() {
118        // Create a VarBinViewArray with some long strings that will create multiple buffers
119        let original = VarBinViewArray::from_iter_nullable_str([
120            Some("short"),
121            Some("this is a longer string that will be stored in a buffer"),
122            Some("medium length string"),
123            Some("another very long string that definitely needs a buffer to store it"),
124            Some("tiny"),
125        ]);
126
127        // Verify it has buffers
128        assert!(original.nbuffers() > 0);
129        let original_buffers = original.nbuffers();
130
131        // Take only the first and last elements (indices 0 and 4)
132        let indices = buffer![0u32, 4u32].into_array();
133        let taken = take(original.as_ref(), &indices).unwrap();
134        let taken_array = taken.as_::<VarBinViewVTable>();
135
136        // The taken array should still have the same number of buffers
137        assert_eq!(taken_array.nbuffers(), original_buffers);
138
139        // Now optimize the taken array
140        let optimized_array = taken_array.compact_buffers().unwrap();
141
142        // The optimized array should have compacted buffers
143        // Since both remaining strings are short, they should be inlined
144        // so we might have 0 buffers, or 1 buffer if any were not inlined
145        assert!(optimized_array.nbuffers() <= 1);
146
147        // Verify the data is still correct
148        assert_eq!(optimized_array.len(), 2);
149        assert_eq!(optimized_array.scalar_at(0), "short".into());
150        assert_eq!(optimized_array.scalar_at(1), "tiny".into());
151    }
152
153    #[test]
154    fn test_optimize_with_long_strings() {
155        // Create strings that are definitely longer than 12 bytes
156        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
157        let long_string_2 = "another extremely long string that also needs external buffer storage";
158        let long_string_3 = "yet another long string for testing buffer compaction functionality";
159
160        let original = VarBinViewArray::from_iter_str([
161            long_string_1,
162            long_string_2,
163            long_string_3,
164            "short1",
165            "short2",
166        ]);
167
168        // Take only the first and third long strings (indices 0 and 2)
169        let indices = buffer![0u32, 2u32].into_array();
170        let taken = take(original.as_ref(), &indices).unwrap();
171        let taken_array = taken.as_::<VarBinViewVTable>();
172
173        // Optimize the taken array
174        let optimized_array = taken_array.compact_buffers().unwrap();
175
176        // The optimized array should have exactly 1 buffer (consolidated)
177        assert_eq!(optimized_array.nbuffers(), 1);
178
179        // Verify the data is still correct
180        assert_eq!(optimized_array.len(), 2);
181        assert_eq!(optimized_array.scalar_at(0), long_string_1.into());
182        assert_eq!(optimized_array.scalar_at(1), long_string_3.into());
183    }
184
185    #[test]
186    fn test_optimize_no_buffers() {
187        // Create an array with only short strings (all inlined)
188        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
189
190        // This should have no buffers
191        assert_eq!(original.nbuffers(), 0);
192
193        // Optimize should return the same array
194        let optimized_array = original.compact_buffers().unwrap();
195
196        assert_eq!(optimized_array.nbuffers(), 0);
197        assert_eq!(optimized_array.len(), 4);
198
199        // Verify all values are preserved
200        for i in 0..4 {
201            assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
202        }
203    }
204
205    #[test]
206    fn test_optimize_single_buffer() {
207        // Create an array that naturally has only one buffer
208        let str1 = "this is a long string that goes into a buffer";
209        let str2 = "another long string in the same buffer";
210        let original = VarBinViewArray::from_iter_str([str1, str2]);
211
212        // Should have 1 compact buffer
213        assert_eq!(original.nbuffers(), 1);
214        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
215
216        // Optimize should return the same array (no change needed)
217        let optimized_array = original.compact_buffers().unwrap();
218
219        assert_eq!(optimized_array.nbuffers(), 1);
220        assert_eq!(optimized_array.len(), 2);
221
222        // Verify all values are preserved
223        for i in 0..2 {
224            assert_eq!(optimized_array.scalar_at(i), original.scalar_at(i));
225        }
226    }
227}