vortex_array/arrays/varbinview/
compact.rs

1//  SPDX-License-Identifier: Apache-2.0
2//  SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5//! be dropped.
6
7use vortex_error::{VortexResult, VortexUnwrap};
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14/// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
15/// operation can be very expensive, in the worst cast copying all existing string data into
16/// a new allocation.
17///
18/// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
19/// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
20/// so, will aggressively compact all visile outlined string data into a single new buffer.
21pub fn compact_buffers(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
22    // If there is nothing to be gained by compaction, return the original array untouched.
23    if !should_compact(array) {
24        return Ok(array.clone());
25    }
26
27    // Compaction pathways, depend on the validity
28    match array.validity() {
29        // The array contains no values, all buffers can be dropped.
30        Validity::AllInvalid => Ok(VarBinViewArray::try_new(
31            array.views().clone(),
32            vec![],
33            array.dtype().clone(),
34            array.validity().clone(),
35        )?),
36        // Non-null pathway
37        Validity::NonNullable | Validity::AllValid => rebuild_nonnull(array),
38        // Nullable pathway, requires null-checks for each value
39        Validity::Array(_) => rebuild_nullable(array),
40    }
41}
42
43fn should_compact(array: &VarBinViewArray) -> bool {
44    // If the array is entirely inlined strings, do not attempt to compact.
45    if array.nbuffers() == 0 {
46        return false;
47    }
48
49    let bytes_referenced: u64 = count_referenced_bytes(array);
50    let buffer_total_bytes: u64 = array.buffers.iter().map(|buf| buf.len() as u64).sum();
51
52    // If there is any wasted space, we want to repack.
53    // This is very aggressive.
54    bytes_referenced < buffer_total_bytes
55}
56
57// count the number of bytes addressed by the views, not including null
58// values or any inlined strings.
59fn count_referenced_bytes(array: &VarBinViewArray) -> u64 {
60    match array.validity() {
61        Validity::AllInvalid => 0u64,
62        _ => {
63            array
64                .views()
65                .iter()
66                .enumerate()
67                .map(|(idx, &view)| {
68                    if !array.is_valid(idx).vortex_unwrap() || view.is_inlined() {
69                        0u64
70                    } else {
71                        // SAFETY: in this branch the view is not inlined.
72                        unsafe { view._ref }.size as u64
73                    }
74                })
75                .sum()
76        }
77    }
78}
79
80// Nullable string array compaction pathway.
81// This requires a null check on every append.
82fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
83    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
84    for i in 0..array.len() {
85        if !array.is_valid(i)? {
86            builder.append_null();
87        } else {
88            let bytes = array.bytes_at(i);
89            builder.append_value(bytes.as_slice());
90        }
91    }
92
93    Ok(builder.finish_into_varbinview())
94}
95
96// Compaction for string arrays that contain no null values. Saves a branch
97// for every string element.
98fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
99    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
100    for i in 0..array.len() {
101        builder.append_value(array.bytes_at(i).as_ref());
102    }
103    Ok(builder.finish_into_varbinview())
104}
105
106#[cfg(test)]
107mod tests {
108    use vortex_buffer::buffer;
109
110    use crate::IntoArray;
111    use crate::arrays::varbinview::compact::compact_buffers;
112    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
113    use crate::compute::take;
114
115    #[test]
116    fn test_optimize_compacts_buffers() {
117        // Create a VarBinViewArray with some long strings that will create multiple buffers
118        let original = VarBinViewArray::from_iter_nullable_str([
119            Some("short"),
120            Some("this is a longer string that will be stored in a buffer"),
121            Some("medium length string"),
122            Some("another very long string that definitely needs a buffer to store it"),
123            Some("tiny"),
124        ]);
125
126        // Verify it has buffers
127        assert!(original.nbuffers() > 0);
128        let original_buffers = original.nbuffers();
129
130        // Take only the first and last elements (indices 0 and 4)
131        let indices = buffer![0u32, 4u32].into_array();
132        let taken = take(original.as_ref(), &indices).unwrap();
133        let taken_array = taken.as_::<VarBinViewVTable>();
134
135        // The taken array should still have the same number of buffers
136        assert_eq!(taken_array.nbuffers(), original_buffers);
137
138        // Now optimize the taken array
139        let optimized_array = compact_buffers(taken_array).unwrap();
140
141        // The optimized array should have compacted buffers
142        // Since both remaining strings are short, they should be inlined
143        // so we might have 0 buffers, or 1 buffer if any were not inlined
144        assert!(optimized_array.nbuffers() <= 1);
145
146        // Verify the data is still correct
147        assert_eq!(optimized_array.len(), 2);
148        assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
149        assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
150    }
151
152    #[test]
153    fn test_optimize_with_long_strings() {
154        // Create strings that are definitely longer than 12 bytes
155        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
156        let long_string_2 = "another extremely long string that also needs external buffer storage";
157        let long_string_3 = "yet another long string for testing buffer compaction functionality";
158
159        let original = VarBinViewArray::from_iter_str([
160            long_string_1,
161            long_string_2,
162            long_string_3,
163            "short1",
164            "short2",
165        ]);
166
167        // Take only the first and third long strings (indices 0 and 2)
168        let indices = buffer![0u32, 2u32].into_array();
169        let taken = take(original.as_ref(), &indices).unwrap();
170        let taken_array = taken.as_::<VarBinViewVTable>();
171
172        // Optimize the taken array
173        let optimized_array = compact_buffers(taken_array).unwrap();
174
175        // The optimized array should have exactly 1 buffer (consolidated)
176        assert_eq!(optimized_array.nbuffers(), 1);
177
178        // Verify the data is still correct
179        assert_eq!(optimized_array.len(), 2);
180        assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
181        assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
182    }
183
184    #[test]
185    fn test_optimize_no_buffers() {
186        // Create an array with only short strings (all inlined)
187        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
188
189        // This should have no buffers
190        assert_eq!(original.nbuffers(), 0);
191
192        // Optimize should return the same array
193        let optimized_array = compact_buffers(&original).unwrap();
194
195        assert_eq!(optimized_array.nbuffers(), 0);
196        assert_eq!(optimized_array.len(), 4);
197
198        // Verify all values are preserved
199        for i in 0..4 {
200            assert_eq!(
201                optimized_array.scalar_at(i).unwrap(),
202                original.scalar_at(i).unwrap()
203            );
204        }
205    }
206
207    #[test]
208    fn test_optimize_single_buffer() {
209        // Create an array that naturally has only one buffer
210        let str1 = "this is a long string that goes into a buffer";
211        let str2 = "another long string in the same buffer";
212        let original = VarBinViewArray::from_iter_str([str1, str2]);
213
214        // Should have 1 compact buffer
215        assert_eq!(original.nbuffers(), 1);
216        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
217
218        // Optimize should return the same array (no change needed)
219        let optimized_array = compact_buffers(&original).unwrap();
220
221        assert_eq!(optimized_array.nbuffers(), 1);
222        assert_eq!(optimized_array.len(), 2);
223
224        // Verify all values are preserved
225        for i in 0..2 {
226            assert_eq!(
227                optimized_array.scalar_at(i).unwrap(),
228                original.scalar_at(i).unwrap()
229            );
230        }
231    }
232}