// vortex_array/arrays/varbinview/compact.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Defines a compaction operation for VarBinViewArrays that evicts unused buffers so they can
//! be dropped.

use std::ops::Range;

use vortex_error::{VortexExpect, VortexResult};

use crate::arrays::VarBinViewArray;
use crate::builders::{ArrayBuilder, VarBinViewBuilder};
use crate::validity::Validity;
use crate::vtable::ValidityHelper;
16impl VarBinViewArray {
17    /// Returns a compacted copy of the input array, where all wasted space has been cleaned up. This
18    /// operation can be very expensive, in the worst case copying all existing string data into
19    /// a new allocation.
20    ///
21    /// After slicing/taking operations `VarBinViewArray`s can continue to hold references to buffers
22    /// that are no longer visible. We detect when there is wasted space in any of the buffers, and if
23    /// so, will aggressively compact all visible outlined string data into new buffers while keeping
24    /// well-utilized buffers unchanged.
25    pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
26        // If there is nothing to be gained by compaction, return the original array untouched.
27        if !self.should_compact() {
28            return Ok(self.clone());
29        }
30
31        // Use selective compaction with threshold of 1.0 (compact any buffer with any waste)
32        self.compact_with_threshold(1.0)
33    }
34
35    fn should_compact(&self) -> bool {
36        let nbuffers = self.nbuffers();
37
38        // If the array is entirely inlined strings, do not attempt to compact.
39        if nbuffers == 0 {
40            return false;
41        }
42
43        // These will fail to write, so in most cases we want to compact this.
44        if nbuffers > u16::MAX as usize {
45            return true;
46        }
47
48        let bytes_referenced: u64 = self.count_referenced_bytes();
49        let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
50
51        // If there is any wasted space, we want to repack.
52        // This is very aggressive.
53        bytes_referenced < buffer_total_bytes || buffer_total_bytes == 0
54    }
55
56    // count the number of bytes addressed by the views, not including null
57    // values or any inlined strings.
58    fn count_referenced_bytes(&self) -> u64 {
59        match self.validity() {
60            Validity::AllInvalid => 0u64,
61            Validity::NonNullable | Validity::AllValid | Validity::Array(_) => self
62                .views()
63                .iter()
64                .enumerate()
65                .map(|(idx, &view)| {
66                    if !self.is_valid(idx) || view.is_inlined() {
67                        0u64
68                    } else {
69                        view.len() as u64
70                    }
71                })
72                .sum(),
73        }
74    }
75
76    pub(crate) fn buffer_utilizations(&self) -> Vec<BufferUtilization> {
77        let mut utilizations = self
78            .buffers()
79            .iter()
80            .map(|buf| {
81                let len = u32::try_from(buf.len()).vortex_expect("buffer sizes must fit in u32");
82                BufferUtilization::zero(len)
83            })
84            .collect();
85
86        if matches!(self.validity(), Validity::AllInvalid) {
87            return utilizations;
88        }
89
90        for (idx, &view) in self.views().iter().enumerate() {
91            if !self.is_valid(idx) || view.is_inlined() {
92                continue;
93            }
94            let view = view.as_view();
95
96            utilizations[view.buffer_index as usize].add(view.offset, view.size)
97        }
98
99        utilizations
100    }
101
102    /// Returns a compacted copy of the input array using selective buffer compaction.
103    ///
104    /// This method analyzes each buffer's utilization and applies one of three strategies:
105    /// - **KeepFull** (zero-copy): Well-utilized buffers are kept unchanged
106    /// - **Slice** (zero-copy): Buffers with contiguous ranges of used data are sliced to that range
107    /// - **Rewrite**: Poorly-utilized buffers have their data copied to new compact buffers
108    ///
109    /// By preserving or slicing well-utilized buffers, compaction becomes zero-copy in many cases.
110    ///
111    /// # Arguments
112    ///
113    /// * `buffer_utilization_threshold` - Threshold in range [0, 1]. Buffers with utilization
114    ///   below this value will be compacted. Use 0.0 for no compaction, 1.0 for aggressive
115    ///   compaction of any buffer with wasted space.
116    pub fn compact_with_threshold(
117        &self,
118        buffer_utilization_threshold: f64, // [0, 1]
119    ) -> VortexResult<VarBinViewArray> {
120        let mut builder = VarBinViewBuilder::with_compaction(
121            self.dtype().clone(),
122            self.len(),
123            buffer_utilization_threshold,
124        );
125        builder.extend_from_array(self.as_ref());
126        Ok(builder.finish_into_varbinview())
127    }
128}
129
/// Usage statistics for a single data buffer, accumulated over the views that reference it.
pub(crate) struct BufferUtilization {
    /// Total length of the buffer in bytes.
    len: u32,
    /// Sum of the sizes of all recorded views. Views may overlap or repeat (e.g. after `take`
    /// with duplicate indices), so this total can exceed `len` and must be wider than u32 to
    /// avoid overflow — `count_referenced_bytes` accumulates the same sum in a u64 for the
    /// same reason.
    used: u64,
    /// Smallest offset referenced by any view; `u32::MAX` until a view is recorded.
    min_offset: u32,
    /// One past the largest byte referenced by any view; 0 until a view is recorded.
    max_offset_end: u32,
}

impl BufferUtilization {
    /// A fresh accumulator for a buffer of `len` bytes with no views recorded yet.
    fn zero(len: u32) -> Self {
        BufferUtilization {
            len,
            used: 0u64,
            min_offset: u32::MAX,
            max_offset_end: 0,
        }
    }

    /// Record a view referencing `size` bytes starting at `offset`.
    fn add(&mut self, offset: u32, size: u32) {
        // Widen before adding: repeated/overlapping views can push the running total past
        // u32::MAX even though each individual view size fits in a u32.
        self.used += size as u64;
        self.min_offset = self.min_offset.min(offset);
        // A valid view satisfies offset + size <= len <= u32::MAX; saturate defensively so a
        // corrupt view cannot wrap and shrink the computed range.
        self.max_offset_end = self.max_offset_end.max(offset.saturating_add(size));
    }

    /// Fraction of the whole buffer referenced by views. May exceed 1.0 when views overlap.
    pub fn overall_utilization(&self) -> f64 {
        match self.len {
            0 => 0.0,
            len => self.used as f64 / len as f64,
        }
    }

    /// Fraction of the `[min_offset, max_offset_end)` span referenced by views.
    pub fn range_utilization(&self) -> f64 {
        match self.range_span() {
            0 => 0.0,
            span => self.used as f64 / span as f64,
        }
    }

    /// The contiguous byte range spanned by all recorded views.
    pub fn range(&self) -> Range<u32> {
        self.min_offset..self.max_offset_end
    }

    /// Width of [`Self::range`]; zero when no views have been recorded.
    fn range_span(&self) -> u32 {
        self.max_offset_end.saturating_sub(self.min_offset)
    }
}
175
#[cfg(test)]
mod tests {
    use vortex_buffer::buffer;

    use crate::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
    use crate::compute::take;
    use crate::{IntoArray, assert_arrays_eq};

    #[test]
    fn test_compact_evicts_unused_buffers() {
        // Create a VarBinViewArray with some long strings that will create multiple buffers
        let original = VarBinViewArray::from_iter_nullable_str([
            Some("short"),
            Some("this is a longer string that will be stored in a buffer"),
            Some("medium length string"),
            Some("another very long string that definitely needs a buffer to store it"),
            Some("tiny"),
        ]);

        // Verify it has buffers
        assert!(original.nbuffers() > 0);
        let original_buffers = original.nbuffers();

        // Take only the first and last elements (indices 0 and 4)
        let indices = buffer![0u32, 4u32].into_array();
        let taken = take(original.as_ref(), &indices).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        // The taken array should still have the same number of buffers
        assert_eq!(taken_array.nbuffers(), original_buffers);

        // Now compact the taken array
        let compacted = taken_array.compact_buffers().unwrap();

        // The compacted array should have compacted buffers.
        // Since both remaining strings are short, they should be inlined,
        // so we might have 0 buffers, or 1 buffer if any were not inlined.
        assert!(compacted.nbuffers() <= 1);

        // Verify the data is still correct
        assert_arrays_eq!(
            compacted,
            <VarBinArray as FromIterator<_>>::from_iter([Some("short"), Some("tiny")])
        );
    }

    #[test]
    fn test_compact_with_long_strings() {
        // Create strings that are definitely longer than 12 bytes
        let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
        let long_string_2 = "another extremely long string that also needs external buffer storage";
        let long_string_3 = "yet another long string for testing buffer compaction functionality";

        let original = VarBinViewArray::from_iter_str([
            long_string_1,
            long_string_2,
            long_string_3,
            "short1",
            "short2",
        ]);

        // Take only the first and third long strings (indices 0 and 2)
        let indices = buffer![0u32, 2u32].into_array();
        let taken = take(original.as_ref(), &indices).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        // Compact the taken array
        let compacted = taken_array.compact_buffers().unwrap();

        // The compacted array should have exactly 1 buffer (consolidated)
        assert_eq!(compacted.nbuffers(), 1);

        // Verify the data is still correct
        assert_arrays_eq!(
            compacted,
            VarBinArray::from(vec![long_string_1, long_string_3])
        );
    }

    #[test]
    fn test_compact_no_buffers() {
        // Create an array with only short strings (all inlined)
        let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);

        // This should have no buffers
        assert_eq!(original.nbuffers(), 0);

        // Compaction should return the same array
        let compacted = original.compact_buffers().unwrap();

        assert_eq!(compacted.nbuffers(), 0);

        assert_arrays_eq!(compacted, original);
    }

    #[test]
    fn test_compact_single_buffer() {
        // Create an array that naturally has only one buffer
        let str1 = "this is a long string that goes into a buffer";
        let str2 = "another long string in the same buffer";
        let original = VarBinViewArray::from_iter_str([str1, str2]);

        // Should have 1 compact buffer
        assert_eq!(original.nbuffers(), 1);
        assert_eq!(original.buffer(0).len(), str1.len() + str2.len());

        // Compaction should return the same array (no change needed)
        let compacted = original.compact_buffers().unwrap();

        assert_eq!(compacted.nbuffers(), 1);

        assert_arrays_eq!(compacted, original);
    }

    #[test]
    fn test_selective_compaction_with_threshold_zero() {
        // threshold=0 should keep all buffers (no compaction)
        let original = VarBinViewArray::from_iter_str([
            "this is a longer string that will be stored in a buffer",
            "another very long string that definitely needs a buffer to store it",
        ]);

        let original_buffers = original.nbuffers();
        assert!(original_buffers > 0);

        // Take only first element
        let indices = buffer![0u32].into_array();
        let taken = take(original.as_ref(), &indices).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        // Compact with threshold=0 (should not compact)
        let compacted = taken_array.compact_with_threshold(0.0).unwrap();

        // Should still have the same number of buffers as the taken array
        assert_eq!(compacted.nbuffers(), taken_array.nbuffers());

        // Verify correctness
        assert_arrays_eq!(compacted, taken);
    }

    #[test]
    fn test_selective_compaction_with_high_threshold() {
        // threshold=1.0 should compact any buffer with waste
        let original = VarBinViewArray::from_iter_str([
            "this is a longer string that will be stored in a buffer",
            "another very long string that definitely needs a buffer to store it",
            "yet another long string",
        ]);

        // Take only first and last elements
        let indices = buffer![0u32, 2u32].into_array();
        let taken = take(original.as_ref(), &indices).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        let original_buffers = taken_array.nbuffers();

        // Compact with threshold=1.0 (aggressive compaction)
        let compacted = taken_array.compact_with_threshold(1.0).unwrap();

        // Should have compacted buffers
        assert!(compacted.nbuffers() <= original_buffers);

        // Verify correctness
        assert_arrays_eq!(compacted, taken);
    }

    #[test]
    fn test_selective_compaction_preserves_well_utilized_buffers() {
        // Create an array with multiple strings in one buffer (well-utilized)
        let str1 = "first long string that needs external buffer storage";
        let str2 = "second long string also in buffer";
        let str3 = "third long string in same buffer";

        let original = VarBinViewArray::from_iter_str([str1, str2, str3]);

        // All strings should be in one well-utilized buffer
        assert_eq!(original.nbuffers(), 1);

        // Compact with high threshold
        let compacted = original.compact_with_threshold(0.8).unwrap();

        // Well-utilized buffer should be preserved
        assert_eq!(compacted.nbuffers(), 1);

        // Verify all data is correct
        assert_arrays_eq!(compacted, original);
    }

    #[test]
    fn test_selective_compaction_with_mixed_utilization() {
        // Create array with some long strings
        let strings: Vec<String> = (0..10)
            .map(|i| {
                format!(
                    "this is a long string number {} that needs buffer storage",
                    i
                )
            })
            .collect();

        let original = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str()));

        // Take every other element to create mixed utilization
        let indices_array = buffer![0u32, 2u32, 4u32, 6u32, 8u32].into_array();
        let taken = take(original.as_ref(), &indices_array).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        // Compact with moderate threshold
        let compacted = taken_array.compact_with_threshold(0.7).unwrap();

        // Verify correctness
        assert_arrays_eq!(compacted, taken);
    }

    #[test]
    fn test_slice_strategy_with_contiguous_range() {
        // Create array with strings that will be in one buffer
        let strings: Vec<String> = (0..20)
            .map(|i| format!("this is a long string number {} for slice test", i))
            .collect();

        let original = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str()));

        // Take only the first 5 elements - they should be in a contiguous range at the start
        let indices_array = buffer![0u32, 1u32, 2u32, 3u32, 4u32].into_array();
        let taken = take(original.as_ref(), &indices_array).unwrap();
        let taken_array = taken.as_::<VarBinViewVTable>();

        // Get buffer stats before compaction
        let utils_before = taken_array.buffer_utilizations();
        let original_buffer_count = taken_array.nbuffers();

        // Compact with a threshold that should trigger slicing
        // The range utilization should be high even if overall utilization is low
        let compacted = taken_array.compact_with_threshold(0.8).unwrap();

        // After compaction, we should still have buffers (sliced, not rewritten)
        assert!(
            compacted.nbuffers() > 0,
            "Should have buffers after slice compaction"
        );

        // Verify correctness
        assert_arrays_eq!(&compacted, taken);

        // Verify that if there was only one buffer, the compacted version also has one
        // (it was sliced, not rewritten into multiple buffers)
        if original_buffer_count == 1 && utils_before[0].range_utilization() >= 0.8 {
            assert_eq!(
                compacted.nbuffers(),
                1,
                "Slice strategy should maintain single buffer"
            );
        }
    }
}
431}