vortex_array/arrays/varbinview/
compact.rs

1//  SPDX-License-Identifier: Apache-2.0
2//  SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
5//! be dropped.
6
7use vortex_error::{VortexResult, VortexUnwrap};
8
9use crate::arrays::VarBinViewArray;
10use crate::builders::{ArrayBuilder, VarBinViewBuilder};
11use crate::validity::Validity;
12use crate::vtable::ValidityHelper;
13
14impl VarBinViewArray {
15    /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
16    /// operation can be very expensive, in the worst cast copying all existing string data into
17    /// a new allocation.
18    ///
19    /// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
20    /// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
21    /// so, will aggressively compact all visile outlined string data into a single new buffer.
22    pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
23        // If there is nothing to be gained by compaction, return the original array untouched.
24        if !self.should_compact() {
25            return Ok(self.clone());
26        }
27
28        // Compaction pathways, depend on the validity
29        match self.validity() {
30            // The array contains no values, all buffers can be dropped.
31            Validity::AllInvalid => Ok(VarBinViewArray::try_new(
32                self.views().clone(),
33                Default::default(),
34                self.dtype().clone(),
35                self.validity().clone(),
36            )?),
37            // Non-null pathway
38            Validity::NonNullable | Validity::AllValid => rebuild_nonnull(self),
39            // Nullable pathway, requires null-checks for each value
40            Validity::Array(_) => rebuild_nullable(self),
41        }
42    }
43
44    fn should_compact(&self) -> bool {
45        // If the array is entirely inlined strings, do not attempt to compact.
46        if self.nbuffers() == 0 {
47            return false;
48        }
49
50        let bytes_referenced: u64 = self.count_referenced_bytes();
51        let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
52
53        // If there is any wasted space, we want to repack.
54        // This is very aggressive.
55        bytes_referenced < buffer_total_bytes
56    }
57
58    // count the number of bytes addressed by the views, not including null
59    // values or any inlined strings.
60    fn count_referenced_bytes(&self) -> u64 {
61        match self.validity() {
62            Validity::AllInvalid => 0u64,
63            _ => self
64                .views()
65                .iter()
66                .enumerate()
67                .map(|(idx, &view)| {
68                    if !self.is_valid(idx).vortex_unwrap() || view.is_inlined() {
69                        0u64
70                    } else {
71                        view.len() as u64
72                    }
73                })
74                .sum(),
75        }
76    }
77}
78
79// Nullable string array compaction pathway.
80// This requires a null check on every append.
81fn rebuild_nullable(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
82    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
83    for i in 0..array.len() {
84        if !array.is_valid(i)? {
85            builder.append_null();
86        } else {
87            let bytes = array.bytes_at(i);
88            builder.append_value(bytes.as_slice());
89        }
90    }
91
92    Ok(builder.finish_into_varbinview())
93}
94
95// Compaction for string arrays that contain no null values. Saves a branch
96// for every string element.
97fn rebuild_nonnull(array: &VarBinViewArray) -> VortexResult<VarBinViewArray> {
98    let mut builder = VarBinViewBuilder::with_capacity(array.dtype().clone(), array.len());
99    for i in 0..array.len() {
100        builder.append_value(array.bytes_at(i).as_ref());
101    }
102    Ok(builder.finish_into_varbinview())
103}
104
105#[cfg(test)]
106mod tests {
107    use vortex_buffer::buffer;
108
109    use crate::IntoArray;
110    use crate::arrays::{VarBinViewArray, VarBinViewVTable};
111    use crate::compute::take;
112
113    #[test]
114    fn test_optimize_compacts_buffers() {
115        // Create a VarBinViewArray with some long strings that will create multiple buffers
116        let original = VarBinViewArray::from_iter_nullable_str([
117            Some("short"),
118            Some("this is a longer string that will be stored in a buffer"),
119            Some("medium length string"),
120            Some("another very long string that definitely needs a buffer to store it"),
121            Some("tiny"),
122        ]);
123
124        // Verify it has buffers
125        assert!(original.nbuffers() > 0);
126        let original_buffers = original.nbuffers();
127
128        // Take only the first and last elements (indices 0 and 4)
129        let indices = buffer![0u32, 4u32].into_array();
130        let taken = take(original.as_ref(), &indices).unwrap();
131        let taken_array = taken.as_::<VarBinViewVTable>();
132
133        // The taken array should still have the same number of buffers
134        assert_eq!(taken_array.nbuffers(), original_buffers);
135
136        // Now optimize the taken array
137        let optimized_array = taken_array.compact_buffers().unwrap();
138
139        // The optimized array should have compacted buffers
140        // Since both remaining strings are short, they should be inlined
141        // so we might have 0 buffers, or 1 buffer if any were not inlined
142        assert!(optimized_array.nbuffers() <= 1);
143
144        // Verify the data is still correct
145        assert_eq!(optimized_array.len(), 2);
146        assert_eq!(optimized_array.scalar_at(0).unwrap(), "short".into());
147        assert_eq!(optimized_array.scalar_at(1).unwrap(), "tiny".into());
148    }
149
150    #[test]
151    fn test_optimize_with_long_strings() {
152        // Create strings that are definitely longer than 12 bytes
153        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
154        let long_string_2 = "another extremely long string that also needs external buffer storage";
155        let long_string_3 = "yet another long string for testing buffer compaction functionality";
156
157        let original = VarBinViewArray::from_iter_str([
158            long_string_1,
159            long_string_2,
160            long_string_3,
161            "short1",
162            "short2",
163        ]);
164
165        // Take only the first and third long strings (indices 0 and 2)
166        let indices = buffer![0u32, 2u32].into_array();
167        let taken = take(original.as_ref(), &indices).unwrap();
168        let taken_array = taken.as_::<VarBinViewVTable>();
169
170        // Optimize the taken array
171        let optimized_array = taken_array.compact_buffers().unwrap();
172
173        // The optimized array should have exactly 1 buffer (consolidated)
174        assert_eq!(optimized_array.nbuffers(), 1);
175
176        // Verify the data is still correct
177        assert_eq!(optimized_array.len(), 2);
178        assert_eq!(optimized_array.scalar_at(0).unwrap(), long_string_1.into());
179        assert_eq!(optimized_array.scalar_at(1).unwrap(), long_string_3.into());
180    }
181
182    #[test]
183    fn test_optimize_no_buffers() {
184        // Create an array with only short strings (all inlined)
185        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
186
187        // This should have no buffers
188        assert_eq!(original.nbuffers(), 0);
189
190        // Optimize should return the same array
191        let optimized_array = original.compact_buffers().unwrap();
192
193        assert_eq!(optimized_array.nbuffers(), 0);
194        assert_eq!(optimized_array.len(), 4);
195
196        // Verify all values are preserved
197        for i in 0..4 {
198            assert_eq!(
199                optimized_array.scalar_at(i).unwrap(),
200                original.scalar_at(i).unwrap()
201            );
202        }
203    }
204
205    #[test]
206    fn test_optimize_single_buffer() {
207        // Create an array that naturally has only one buffer
208        let str1 = "this is a long string that goes into a buffer";
209        let str2 = "another long string in the same buffer";
210        let original = VarBinViewArray::from_iter_str([str1, str2]);
211
212        // Should have 1 compact buffer
213        assert_eq!(original.nbuffers(), 1);
214        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
215
216        // Optimize should return the same array (no change needed)
217        let optimized_array = original.compact_buffers().unwrap();
218
219        assert_eq!(optimized_array.nbuffers(), 1);
220        assert_eq!(optimized_array.len(), 2);
221
222        // Verify all values are preserved
223        for i in 0..2 {
224            assert_eq!(
225                optimized_array.scalar_at(i).unwrap(),
226                original.scalar_at(i).unwrap()
227            );
228        }
229    }
230}