vortex_array/arrays/listview/
rebuild.rs1use num_traits::FromPrimitive;
5use vortex_buffer::BufferMut;
6use vortex_error::VortexExpect;
7use vortex_error::VortexResult;
8
9use crate::Array;
10use crate::IntoArray;
11use crate::ToCanonical;
12use crate::arrays::ConstantArray;
13use crate::arrays::ListViewArray;
14use crate::builders::builder_with_capacity;
15use crate::builtins::ArrayBuiltins;
16use crate::compute;
17use crate::dtype::IntegerPType;
18use crate::dtype::Nullability;
19use crate::match_each_integer_ptype;
20use crate::scalar::Scalar;
21use crate::scalar_fn::fns::operators::Operator;
22use crate::vtable::ValidityHelper;
23
24pub enum ListViewRebuildMode {
26 MakeZeroCopyToList,
34
35 TrimElements,
38
39 MakeExact,
46
47 OverlapCompression,
54}
55
56impl ListViewArray {
57 pub fn rebuild(&self, mode: ListViewRebuildMode) -> VortexResult<ListViewArray> {
59 if self.is_empty() {
60 return Ok(self.clone());
61 }
62
63 match mode {
64 ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
65 ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
66 ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
67 ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
68 }
69 }
70
71 fn rebuild_zero_copy_to_list(&self) -> VortexResult<ListViewArray> {
79 if self.is_zero_copy_to_list() {
80 return Ok(self.clone());
82 }
83
84 let offsets_ptype = self.offsets().dtype().as_ptype();
85 let sizes_ptype = self.sizes().dtype().as_ptype();
86
87 match_each_integer_ptype!(sizes_ptype, |S| {
95 match offsets_ptype {
96 PType::U8 => self.naive_rebuild::<u8, u32, S>(),
97 PType::U16 => self.naive_rebuild::<u16, u32, S>(),
98 PType::U32 => self.naive_rebuild::<u32, u32, S>(),
99 PType::U64 => self.naive_rebuild::<u64, u64, S>(),
100 PType::I8 => self.naive_rebuild::<i8, i32, S>(),
101 PType::I16 => self.naive_rebuild::<i16, i32, S>(),
102 PType::I32 => self.naive_rebuild::<i32, i32, S>(),
103 PType::I64 => self.naive_rebuild::<i64, i64, S>(),
104 _ => unreachable!("invalid offsets PType"),
105 }
106 })
107 }
108
109 fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
113 &self,
114 ) -> VortexResult<ListViewArray> {
115 let sizes_canonical = self.sizes().to_primitive();
116 let total: u64 = sizes_canonical
117 .as_slice::<S>()
118 .iter()
119 .map(|s| (*s).as_() as u64)
120 .sum();
121 if Self::should_use_take(total, self.len()) {
122 self.rebuild_with_take::<O, NewOffset, S>()
123 } else {
124 self.rebuild_list_by_list::<O, NewOffset, S>()
125 }
126 }
127
128 fn should_use_take(total_output_elements: u64, num_lists: usize) -> bool {
136 if num_lists == 0 {
137 return true;
138 }
139 let avg = total_output_elements / num_lists as u64;
140 avg < 128
141 }
142
143 fn rebuild_with_take<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
146 &self,
147 ) -> VortexResult<ListViewArray> {
148 let offsets_canonical = self.offsets().to_primitive();
149 let offsets_slice = offsets_canonical.as_slice::<O>();
150 let sizes_canonical = self.sizes().to_primitive();
151 let sizes_slice = sizes_canonical.as_slice::<S>();
152
153 let len = offsets_slice.len();
154
155 let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
156 let mut new_sizes = BufferMut::<S>::with_capacity(len);
157 let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());
158
159 let mut n_elements = NewOffset::zero();
160 for index in 0..len {
161 if !self.is_valid(index)? {
162 new_offsets.push(n_elements);
163 new_sizes.push(S::zero());
164 continue;
165 }
166
167 let offset = offsets_slice[index];
168 let size = sizes_slice[index];
169 let start = offset.as_();
170 let stop = start + size.as_();
171
172 new_offsets.push(n_elements);
173 new_sizes.push(size);
174 take_indices.extend(start as u64..stop as u64);
175 n_elements += num_traits::cast(size).vortex_expect("Cast failed");
176 }
177
178 let elements = self.elements().take(take_indices.into_array())?;
179 let offsets = new_offsets.into_array();
180 let sizes = new_sizes.into_array();
181
182 Ok(unsafe {
186 ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
187 .with_zero_copy_to_list(true)
188 })
189 }
190
191 fn rebuild_list_by_list<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
194 &self,
195 ) -> VortexResult<ListViewArray> {
196 let element_dtype = self
197 .dtype()
198 .as_list_element_opt()
199 .vortex_expect("somehow had a canonical list that was not a list");
200
201 let offsets_canonical = self.offsets().to_primitive();
202 let offsets_slice = offsets_canonical.as_slice::<O>();
203 let sizes_canonical = self.sizes().to_primitive();
204 let sizes_slice = sizes_canonical.as_slice::<S>();
205
206 let len = offsets_slice.len();
207
208 let mut new_offsets = BufferMut::<NewOffset>::with_capacity(len);
209 let mut new_sizes = BufferMut::<S>::with_capacity(len);
214
215 let elements_canonical = self
217 .elements()
218 .to_canonical()
219 .vortex_expect("canonicalize elements for rebuild")
220 .into_array();
221
222 let mut new_elements_builder =
225 builder_with_capacity(element_dtype.as_ref(), self.elements().len());
226
227 let mut n_elements = NewOffset::zero();
228 for index in 0..len {
229 if !self.is_valid(index)? {
230 new_offsets.push(n_elements);
233 new_sizes.push(S::zero());
234 continue;
235 }
236
237 let offset = offsets_slice[index];
238 let size = sizes_slice[index];
239
240 let start = offset.as_();
241 let stop = start + size.as_();
242
243 new_offsets.push(n_elements);
244 new_sizes.push(size);
245 new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
246
247 n_elements += num_traits::cast(size).vortex_expect("Cast failed");
248 }
249
250 let offsets = new_offsets.into_array();
251 let sizes = new_sizes.into_array();
252 let elements = new_elements_builder.finish();
253
254 debug_assert_eq!(
255 n_elements.as_(),
256 elements.len(),
257 "The accumulated elements somehow had the wrong length"
258 );
259
260 Ok(unsafe {
269 ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
270 .with_zero_copy_to_list(true)
271 })
272 }
273
274 fn rebuild_trim_elements(&self) -> VortexResult<ListViewArray> {
278 let start = if self.is_zero_copy_to_list() {
279 self.offset_at(0)
283 } else {
284 self.offsets().statistics().compute_min().vortex_expect(
285 "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
286 )
287 };
288
289 let end = if self.is_zero_copy_to_list() {
290 let last_offset = self.offset_at(self.len() - 1);
293 let last_size = self.size_at(self.len() - 1);
294 last_offset + last_size
295 } else {
296 let min_max = compute::min_max(
297 &self
298 .offsets()
299 .clone()
300 .binary(self.sizes().clone(), Operator::Add)
301 .vortex_expect("`offsets + sizes` somehow overflowed"),
302 )
303 .vortex_expect("Something went wrong while computing min and max")
304 .vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
305
306 min_max
307 .max
308 .as_primitive()
309 .as_::<usize>()
310 .vortex_expect("unable to interpret the max `offset + size` as a `usize`")
311 };
312
313 let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
314 let offset = <O as FromPrimitive>::from_usize(start)
315 .vortex_expect("unable to convert the min offset `start` into a `usize`");
316 let scalar = Scalar::primitive(offset, Nullability::NonNullable);
317
318 self.offsets()
319 .to_array()
320 .binary(
321 ConstantArray::new(scalar, self.offsets().len()).into_array(),
322 Operator::Sub,
323 )
324 .vortex_expect("was somehow unable to adjust offsets down by their minimum")
325 });
326
327 let sliced_elements = self.elements().slice(start..end)?;
328
329 Ok(unsafe {
334 ListViewArray::new_unchecked(
335 sliced_elements,
336 adjusted_offsets,
337 self.sizes().clone(),
338 self.validity().clone(),
339 )
340 .with_zero_copy_to_list(self.is_zero_copy_to_list())
341 })
342 }
343
344 fn rebuild_make_exact(&self) -> VortexResult<ListViewArray> {
345 if self.is_zero_copy_to_list() {
346 self.rebuild_trim_elements()
347 } else {
348 self.rebuild_zero_copy_to_list()
351 }
352 }
353}
354
355#[cfg(test)]
356#[allow(clippy::cast_possible_truncation)]
357mod tests {
358 use vortex_buffer::BitBuffer;
359 use vortex_error::VortexResult;
360
361 use super::ListViewRebuildMode;
362 use crate::IntoArray;
363 use crate::ToCanonical;
364 use crate::arrays::ListViewArray;
365 use crate::arrays::PrimitiveArray;
366 use crate::assert_arrays_eq;
367 use crate::dtype::Nullability;
368 use crate::validity::Validity;
369 use crate::vtable::ValidityHelper;
370
371 #[test]
372 fn test_rebuild_flatten_removes_overlaps() -> VortexResult<()> {
373 let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
377 let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
378 let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
379
380 let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
381
382 let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
383
384 assert_eq!(flattened.elements().len(), 5);
387
388 assert_eq!(flattened.offset_at(0), 0);
390 assert_eq!(flattened.size_at(0), 3);
391 assert_eq!(flattened.offset_at(1), 3);
392 assert_eq!(flattened.size_at(1), 2);
393
394 assert_arrays_eq!(
396 flattened.list_elements_at(0).unwrap(),
397 PrimitiveArray::from_iter([1i32, 2, 3])
398 );
399
400 assert_arrays_eq!(
401 flattened.list_elements_at(1).unwrap(),
402 PrimitiveArray::from_iter([2i32, 3])
403 );
404 Ok(())
405 }
406
407 #[test]
408 fn test_rebuild_flatten_with_nullable() -> VortexResult<()> {
409 use crate::arrays::BoolArray;
410
411 let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
413 let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
414 let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
415 let validity = Validity::Array(
416 BoolArray::new(
417 BitBuffer::from(vec![true, false, true]),
418 Validity::NonNullable,
419 )
420 .into_array(),
421 );
422
423 let listview = ListViewArray::new(elements, offsets, sizes, validity);
424
425 let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
426
427 assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
429 assert!(flattened.validity().is_valid(0).unwrap());
430 assert!(!flattened.validity().is_valid(1).unwrap());
431 assert!(flattened.validity().is_valid(2).unwrap());
432
433 assert_arrays_eq!(
435 flattened.list_elements_at(0).unwrap(),
436 PrimitiveArray::from_iter([1i32, 2])
437 );
438
439 assert_arrays_eq!(
440 flattened.list_elements_at(2).unwrap(),
441 PrimitiveArray::from_iter([3i32])
442 );
443 Ok(())
444 }
445
446 #[test]
447 fn test_rebuild_trim_elements_basic() -> VortexResult<()> {
448 let elements =
456 PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
457 let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
458 let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
459
460 let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
461
462 let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements)?;
463
464 assert_eq!(trimmed.elements().len(), 5);
466
467 assert_eq!(trimmed.offset_at(0), 0);
469 assert_eq!(trimmed.size_at(0), 2);
470 assert_eq!(trimmed.offset_at(1), 3);
471 assert_eq!(trimmed.size_at(1), 2);
472
473 assert_arrays_eq!(
475 trimmed.list_elements_at(0).unwrap(),
476 PrimitiveArray::from_iter([1i32, 2])
477 );
478
479 assert_arrays_eq!(
480 trimmed.list_elements_at(1).unwrap(),
481 PrimitiveArray::from_iter([3i32, 4])
482 );
483
484 let all_elements = trimmed.elements().to_primitive();
486 assert_eq!(all_elements.scalar_at(2).unwrap(), 97i32.into());
487 Ok(())
488 }
489
490 #[test]
491 fn test_rebuild_with_trailing_nulls_regression() -> VortexResult<()> {
492 let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 4]).into_array();
498 let offsets = PrimitiveArray::from_iter(vec![0u32, 2, 0, 0]).into_array();
499 let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 0, 0]).into_array();
500 let validity = Validity::from_iter(vec![true, true, false, false]);
501
502 let listview = ListViewArray::new(elements, offsets, sizes, validity);
503
504 let rebuilt = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?;
506 assert!(rebuilt.is_zero_copy_to_list());
507
508 assert_eq!(rebuilt.offset_at(0), 0);
511 assert_eq!(rebuilt.offset_at(1), 2);
512 assert_eq!(rebuilt.offset_at(2), 4); assert_eq!(rebuilt.offset_at(3), 4); assert_eq!(rebuilt.size_at(0), 2);
517 assert_eq!(rebuilt.size_at(1), 2);
518 assert_eq!(rebuilt.size_at(2), 0); assert_eq!(rebuilt.size_at(3), 0); let exact = rebuilt.rebuild(ListViewRebuildMode::MakeExact)?;
524
525 assert!(exact.is_valid(0).unwrap());
527 assert!(exact.is_valid(1).unwrap());
528 assert!(!exact.is_valid(2).unwrap());
529 assert!(!exact.is_valid(3).unwrap());
530
531 assert_arrays_eq!(
533 exact.list_elements_at(0).unwrap(),
534 PrimitiveArray::from_iter([1i32, 2])
535 );
536
537 assert_arrays_eq!(
538 exact.list_elements_at(1).unwrap(),
539 PrimitiveArray::from_iter([3i32, 4])
540 );
541 Ok(())
542 }
543
544 #[test]
547 fn heuristic_zero_lists_uses_take() {
548 assert!(ListViewArray::should_use_take(0, 0));
549 }
550
551 #[test]
552 fn heuristic_small_lists_use_take() {
553 assert!(ListViewArray::should_use_take(127_000, 1_000));
555 assert!(!ListViewArray::should_use_take(128_000, 1_000));
557 }
558}