vortex_array/arrays/varbinview/
compact.rs1use std::ops::Range;
8
9use vortex_error::{VortexExpect, VortexResult};
10
11use crate::arrays::VarBinViewArray;
12use crate::builders::{ArrayBuilder, VarBinViewBuilder};
13use crate::validity::Validity;
14use crate::vtable::ValidityHelper;
15
16impl VarBinViewArray {
17 pub fn compact_buffers(&self) -> VortexResult<VarBinViewArray> {
26 if !self.should_compact() {
28 return Ok(self.clone());
29 }
30
31 self.compact_with_threshold(1.0)
33 }
34
35 fn should_compact(&self) -> bool {
36 let nbuffers = self.nbuffers();
37
38 if nbuffers == 0 {
40 return false;
41 }
42
43 if nbuffers > u16::MAX as usize {
45 return true;
46 }
47
48 let bytes_referenced: u64 = self.count_referenced_bytes();
49 let buffer_total_bytes: u64 = self.buffers.iter().map(|buf| buf.len() as u64).sum();
50
51 bytes_referenced < buffer_total_bytes || buffer_total_bytes == 0
54 }
55
56 fn count_referenced_bytes(&self) -> u64 {
59 match self.validity() {
60 Validity::AllInvalid => 0u64,
61 Validity::NonNullable | Validity::AllValid | Validity::Array(_) => self
62 .views()
63 .iter()
64 .enumerate()
65 .map(|(idx, &view)| {
66 if !self.is_valid(idx) || view.is_inlined() {
67 0u64
68 } else {
69 view.len() as u64
70 }
71 })
72 .sum(),
73 }
74 }
75
76 pub(crate) fn buffer_utilizations(&self) -> Vec<BufferUtilization> {
77 let mut utilizations = self
78 .buffers()
79 .iter()
80 .map(|buf| {
81 let len = u32::try_from(buf.len()).vortex_expect("buffer sizes must fit in u32");
82 BufferUtilization::zero(len)
83 })
84 .collect();
85
86 if matches!(self.validity(), Validity::AllInvalid) {
87 return utilizations;
88 }
89
90 for (idx, &view) in self.views().iter().enumerate() {
91 if !self.is_valid(idx) || view.is_inlined() {
92 continue;
93 }
94 let view = view.as_view();
95
96 utilizations[view.buffer_index as usize].add(view.offset, view.size)
97 }
98
99 utilizations
100 }
101
102 pub fn compact_with_threshold(
117 &self,
118 buffer_utilization_threshold: f64, ) -> VortexResult<VarBinViewArray> {
120 let mut builder = VarBinViewBuilder::with_compaction(
121 self.dtype().clone(),
122 self.len(),
123 buffer_utilization_threshold,
124 );
125 builder.extend_from_array(self.as_ref());
126 Ok(builder.finish_into_varbinview())
127 }
128}
129
130pub(crate) struct BufferUtilization {
131 len: u32,
132 used: u32,
133 min_offset: u32,
134 max_offset_end: u32,
135}
136
137impl BufferUtilization {
138 fn zero(len: u32) -> Self {
139 BufferUtilization {
140 len,
141 used: 0u32,
142 min_offset: u32::MAX,
143 max_offset_end: 0,
144 }
145 }
146
147 fn add(&mut self, offset: u32, size: u32) {
148 self.used += size;
149 self.min_offset = self.min_offset.min(offset);
150 self.max_offset_end = self.max_offset_end.max(offset + size);
151 }
152
153 pub fn overall_utilization(&self) -> f64 {
154 match self.len {
155 0 => 0.0,
156 len => self.used as f64 / len as f64,
157 }
158 }
159
160 pub fn range_utilization(&self) -> f64 {
161 match self.range_span() {
162 0 => 0.0,
163 span => self.used as f64 / span as f64,
164 }
165 }
166
167 pub fn range(&self) -> Range<u32> {
168 self.min_offset..self.max_offset_end
169 }
170
171 fn range_span(&self) -> u32 {
172 self.max_offset_end.saturating_sub(self.min_offset)
173 }
174}
175
176#[cfg(test)]
177mod tests {
178 use vortex_buffer::buffer;
179
180 use crate::arrays::{VarBinArray, VarBinViewArray, VarBinViewVTable};
181 use crate::compute::take;
182 use crate::{IntoArray, assert_arrays_eq};
183
184 #[test]
185 fn test_optimize_compacts_buffers() {
186 let original = VarBinViewArray::from_iter_nullable_str([
188 Some("short"),
189 Some("this is a longer string that will be stored in a buffer"),
190 Some("medium length string"),
191 Some("another very long string that definitely needs a buffer to store it"),
192 Some("tiny"),
193 ]);
194
195 assert!(original.nbuffers() > 0);
197 let original_buffers = original.nbuffers();
198
199 let indices = buffer![0u32, 4u32].into_array();
201 let taken = take(original.as_ref(), &indices).unwrap();
202 let taken_array = taken.as_::<VarBinViewVTable>();
203
204 assert_eq!(taken_array.nbuffers(), original_buffers);
206
207 let optimized_array = taken_array.compact_buffers().unwrap();
209
210 assert!(optimized_array.nbuffers() <= 1);
214
215 assert_arrays_eq!(
217 optimized_array,
218 <VarBinArray as FromIterator<_>>::from_iter([Some("short"), Some("tiny")])
219 );
220 }
221
222 #[test]
223 fn test_optimize_with_long_strings() {
224 let long_string_1 = "this is definitely a very long string that exceeds the inline limit";
226 let long_string_2 = "another extremely long string that also needs external buffer storage";
227 let long_string_3 = "yet another long string for testing buffer compaction functionality";
228
229 let original = VarBinViewArray::from_iter_str([
230 long_string_1,
231 long_string_2,
232 long_string_3,
233 "short1",
234 "short2",
235 ]);
236
237 let indices = buffer![0u32, 2u32].into_array();
239 let taken = take(original.as_ref(), &indices).unwrap();
240 let taken_array = taken.as_::<VarBinViewVTable>();
241
242 let optimized_array = taken_array.compact_buffers().unwrap();
244
245 assert_eq!(optimized_array.nbuffers(), 1);
247
248 assert_arrays_eq!(
250 optimized_array,
251 VarBinArray::from(vec![long_string_1, long_string_3])
252 );
253 }
254
255 #[test]
256 fn test_optimize_no_buffers() {
257 let original = VarBinViewArray::from_iter_str(["a", "bb", "ccc", "dddd"]);
259
260 assert_eq!(original.nbuffers(), 0);
262
263 let optimized_array = original.compact_buffers().unwrap();
265
266 assert_eq!(optimized_array.nbuffers(), 0);
267
268 assert_arrays_eq!(optimized_array, original);
269 }
270
271 #[test]
272 fn test_optimize_single_buffer() {
273 let str1 = "this is a long string that goes into a buffer";
275 let str2 = "another long string in the same buffer";
276 let original = VarBinViewArray::from_iter_str([str1, str2]);
277
278 assert_eq!(original.nbuffers(), 1);
280 assert_eq!(original.buffer(0).len(), str1.len() + str2.len());
281
282 let optimized_array = original.compact_buffers().unwrap();
284
285 assert_eq!(optimized_array.nbuffers(), 1);
286
287 assert_arrays_eq!(optimized_array, original);
288 }
289
290 #[test]
291 fn test_selective_compaction_with_threshold_zero() {
292 let original = VarBinViewArray::from_iter_str([
294 "this is a longer string that will be stored in a buffer",
295 "another very long string that definitely needs a buffer to store it",
296 ]);
297
298 let original_buffers = original.nbuffers();
299 assert!(original_buffers > 0);
300
301 let indices = buffer![0u32].into_array();
303 let taken = take(original.as_ref(), &indices).unwrap();
304 let taken_array = taken.as_::<VarBinViewVTable>();
305
306 let compacted = taken_array.compact_with_threshold(0.0).unwrap();
308
309 assert_eq!(compacted.nbuffers(), taken_array.nbuffers());
311
312 assert_arrays_eq!(compacted, taken);
314 }
315
316 #[test]
317 fn test_selective_compaction_with_high_threshold() {
318 let original = VarBinViewArray::from_iter_str([
320 "this is a longer string that will be stored in a buffer",
321 "another very long string that definitely needs a buffer to store it",
322 "yet another long string",
323 ]);
324
325 let indices = buffer![0u32, 2u32].into_array();
327 let taken = take(original.as_ref(), &indices).unwrap();
328 let taken_array = taken.as_::<VarBinViewVTable>();
329
330 let original_buffers = taken_array.nbuffers();
331
332 let compacted = taken_array.compact_with_threshold(1.0).unwrap();
334
335 assert!(compacted.nbuffers() <= original_buffers);
337
338 assert_arrays_eq!(compacted, taken);
340 }
341
342 #[test]
343 fn test_selective_compaction_preserves_well_utilized_buffers() {
344 let str1 = "first long string that needs external buffer storage";
346 let str2 = "second long string also in buffer";
347 let str3 = "third long string in same buffer";
348
349 let original = VarBinViewArray::from_iter_str([str1, str2, str3]);
350
351 assert_eq!(original.nbuffers(), 1);
353
354 let compacted = original.compact_with_threshold(0.8).unwrap();
356
357 assert_eq!(compacted.nbuffers(), 1);
359
360 assert_arrays_eq!(compacted, original);
362 }
363
364 #[test]
365 fn test_selective_compaction_with_mixed_utilization() {
366 let strings: Vec<String> = (0..10)
368 .map(|i| {
369 format!(
370 "this is a long string number {} that needs buffer storage",
371 i
372 )
373 })
374 .collect();
375
376 let original = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str()));
377
378 let indices_array = buffer![0u32, 2u32, 4u32, 6u32, 8u32].into_array();
380 let taken = take(original.as_ref(), &indices_array).unwrap();
381 let taken_array = taken.as_::<VarBinViewVTable>();
382
383 let compacted = taken_array.compact_with_threshold(0.7).unwrap();
385
386 assert_arrays_eq!(compacted, taken);
388 }
389
390 #[test]
391 fn test_slice_strategy_with_contiguous_range() {
392 let strings: Vec<String> = (0..20)
394 .map(|i| format!("this is a long string number {} for slice test", i))
395 .collect();
396
397 let original = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str()));
398
399 let indices_array = buffer![0u32, 1u32, 2u32, 3u32, 4u32].into_array();
401 let taken = take(original.as_ref(), &indices_array).unwrap();
402 let taken_array = taken.as_::<VarBinViewVTable>();
403
404 let utils_before = taken_array.buffer_utilizations();
406 let original_buffer_count = taken_array.nbuffers();
407
408 let compacted = taken_array.compact_with_threshold(0.8).unwrap();
411
412 assert!(
414 compacted.nbuffers() > 0,
415 "Should have buffers after slice compaction"
416 );
417
418 assert_arrays_eq!(&compacted, taken);
420
421 if original_buffer_count == 1 && utils_before[0].range_utilization() >= 0.8 {
424 assert_eq!(
425 compacted.nbuffers(),
426 1,
427 "Slice strategy should maintain single buffer"
428 );
429 }
430 }
431}