vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_dtype::{IntegerPType, Nullability, match_each_integer_ptype};
5use vortex_error::VortexExpect;
6
7use crate::arrays::ListViewArray;
8use crate::builders::{ArrayBuilder, ListViewBuilder, PrimitiveBuilder, builder_with_capacity};
9use crate::vtable::ValidityHelper;
10use crate::{Array, IntoArray, ToCanonical};
11
12/// Modes for rebuilding a [`ListViewArray`].
13pub enum ListViewRebuildMode {
14    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
15    /// to a [`ListArray`].
16    ///
17    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
18    /// like a [`ListArray`] but with an additional `sizes` array.
19    ///
20    /// [`ListArray`]: crate::arrays::ListArray
21    MakeZeroCopyToList,
22
23    /// Removes any unused data from the underlying `elements` array.
24    ///
25    /// This mode will rebuild the `elements` array in the process, but it will keep any overlapping
26    /// lists if they exist.
27    RemoveGaps,
28
29    // TODO(connor)[ListView]: Implement some version of this.
30    /// Finds the shortest packing / overlapping of list elements.
31    ///
32    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
33    /// this algorithm (but in all seriousness there are many approximate algorithms that could
34    /// work well here).
35    OverlapCompression,
36}
37
38impl ListViewArray {
39    /// Rebuilds the [`ListViewArray`] according to the specified mode.
40    pub fn rebuild(&self, mode: ListViewRebuildMode) -> ListViewArray {
41        match mode {
42            ListViewRebuildMode::RemoveGaps => self.rebuild_remove_gaps(),
43            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
44            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
45        }
46    }
47
48    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
49    ///
50    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
51    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
52    /// a [`ListArray`].
53    ///
54    /// [`ListArray`]: crate::arrays::ListArray
55    fn rebuild_zero_copy_to_list(&self) -> ListViewArray {
56        let element_dtype = self
57            .dtype()
58            .as_list_element_opt()
59            .vortex_expect("somehow had a canonical list that was not a list");
60
61        let offsets_ptype = self.offsets().dtype().as_ptype();
62        let sizes_ptype = self.sizes().dtype().as_ptype();
63
64        match_each_integer_ptype!(offsets_ptype, |O| {
65            match_each_integer_ptype!(sizes_ptype, |S| {
66                let mut builder = ListViewBuilder::<O, S>::with_capacity(
67                    element_dtype.clone(),
68                    self.dtype().nullability(),
69                    self.elements().len(),
70                    self.len(),
71                );
72
73                builder.extend_from_array(self.as_ref());
74                builder.finish_into_listview()
75            })
76        })
77    }
78
79    /// Rebuilds a [`ListViewArray`] by removing unreferenced elements while preserving overlaps.
80    ///
81    /// This removes "garbage" elements that are not referenced by any list. Unlike
82    /// [`rebuild_flatten()`], this preserves overlap structure where multiple lists may reference
83    /// the same elements.
84    ///
85    /// [`rebuild_flatten()`]: Self::rebuild_flatten
86    fn rebuild_remove_gaps(&self) -> ListViewArray {
87        if self.is_empty() {
88            return self.clone();
89        }
90
91        let offset_ptype = self.offsets().dtype().as_ptype();
92        let size_ptype = self.sizes().dtype().as_ptype();
93
94        match_each_integer_ptype!(offset_ptype, |O| {
95            match_each_integer_ptype!(size_ptype, |S| { self.rebuild_remove_gaps_inner::<O, S>() })
96        })
97    }
98
99    fn rebuild_remove_gaps_inner<O, S>(&self) -> ListViewArray
100    where
101        O: IntegerPType,
102        S: IntegerPType,
103    {
104        let offsets_primitive = self.offsets().to_primitive();
105        let sizes_primitive = self.sizes().to_primitive();
106        let offsets_slice = offsets_primitive.as_slice::<O>();
107        let sizes_slice = sizes_primitive.as_slice::<S>();
108
109        // Mark which elements in the elements array are referenced by at least one list.
110        let mut referenced_elements = vec![false; self.elements().len()];
111        for i in 0..self.len() {
112            let offset: usize = offsets_slice[i].as_();
113            let size: usize = sizes_slice[i].as_();
114            for j in offset..offset + size {
115                referenced_elements[j] = true;
116            }
117        }
118
119        // Fast path: if all elements are referenced, no garbage collection needed.
120        if referenced_elements
121            .iter()
122            .all(|&is_referenced| is_referenced)
123        {
124            return self.clone();
125        }
126
127        // Build a prefix sum that maps each old element index to its new compacted index.
128        // For example, if elements [0, 2, 3] are used, the prefix sum maps: 0->0, 1->1, 2->1, 3->2.
129        let mut prefix_sum = vec![0; self.elements().len()];
130        let mut cumulative_sum = 0;
131        for i in 0..referenced_elements.len() {
132            prefix_sum[i] = cumulative_sum;
133            if referenced_elements[i] {
134                cumulative_sum += 1;
135            }
136        }
137
138        // Copy only the referenced elements into a new compacted elements array.
139        let mut elements_builder = builder_with_capacity(self.elements().dtype(), cumulative_sum);
140        for (i, &is_referenced) in referenced_elements.iter().enumerate() {
141            if is_referenced {
142                elements_builder
143                    .append_scalar(&self.elements().scalar_at(i))
144                    .vortex_expect("append scalar");
145            }
146        }
147
148        // Remap each old offset to its corresponding new offset in the compacted array.
149        let mut new_offsets_builder =
150            PrimitiveBuilder::<O>::with_capacity(Nullability::NonNullable, self.len());
151        for &old_offset in offsets_slice {
152            let new_offset = prefix_sum[old_offset.as_()];
153            let offset_value =
154                O::from_usize(new_offset).vortex_expect("offset must fit in offset type");
155            new_offsets_builder.append_value(offset_value);
156        }
157
158        // SAFETY: The new offsets array is guaranteed to be valid because:
159        // 1. Each new offset is derived from prefix_sum[old_offset], which maps to a valid index
160        //    in the compacted elements array (guaranteed by the prefix sum construction).
161        // 2. The sizes array is unchanged, so each (new_offset, size) pair still describes a
162        //    valid range within the compacted elements array.
163        // 3. The validity is unchanged and still matches the array length.
164        unsafe {
165            ListViewArray::new_unchecked(
166                elements_builder.finish(),
167                new_offsets_builder.finish().into_array(),
168                self.sizes().clone(),
169                self.validity().clone(),
170            )
171        }
172    }
173}
174
175#[cfg(test)]
176mod tests {
177    use vortex_dtype::Nullability;
178
179    use crate::arrays::{ListViewArray, PrimitiveArray};
180    use crate::validity::Validity;
181    use crate::vtable::ValidityHelper;
182    use crate::{IntoArray, ToCanonical};
183
184    #[test]
185    fn test_rebuild_remove_gaps_with_leading_garbage() {
186        // Create a list view with garbage at the beginning: [_, _, A, B, C]
187        // List 0: offset=2, size=2 -> [A, B]
188        // List 1: offset=3, size=2 -> [B, C]
189        let elements = PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 3]).into_array();
190        let offsets = PrimitiveArray::from_iter(vec![2u32, 3]).into_array();
191        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
192
193        let listview =
194            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
195
196        let compacted = listview.rebuild_remove_gaps();
197
198        // After GC: elements should be [A, B, C] = [1, 2, 3]
199        assert_eq!(compacted.elements().len(), 3);
200
201        // Offsets should be remapped: old offset 2 -> new offset 0, old offset 3 -> new offset 1
202        assert_eq!(compacted.offset_at(0), 0);
203        assert_eq!(compacted.size_at(0), 2);
204        assert_eq!(compacted.offset_at(1), 1);
205        assert_eq!(compacted.size_at(1), 2);
206
207        // Verify the data is correct by reading the lists
208        let list0 = compacted.list_elements_at(0).to_primitive();
209        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
210
211        let list1 = compacted.list_elements_at(1).to_primitive();
212        assert_eq!(list1.as_slice::<i32>(), &[2, 3]);
213    }
214
215    #[test]
216    fn test_rebuild_remove_gaps_with_trailing_garbage() {
217        // Create a list view with garbage at the end: [A, B, C, _, _]
218        // List 0: offset=0, size=2 -> [A, B]
219        // List 1: offset=1, size=2 -> [B, C]
220        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3, 99, 98]).into_array();
221        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
222        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
223
224        let listview =
225            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
226
227        let compacted = listview.rebuild_remove_gaps();
228
229        // After GC: elements should be [A, B, C] = [1, 2, 3]
230        assert_eq!(compacted.elements().len(), 3);
231
232        // Offsets should remain the same since no leading garbage
233        assert_eq!(compacted.offset_at(0), 0);
234        assert_eq!(compacted.offset_at(1), 1);
235    }
236
237    #[test]
238    fn test_rebuild_remove_gaps_no_garbage() {
239        // Create a list view with no garbage: [A, B, C]
240        // List 0: offset=0, size=2 -> [A, B]
241        // List 1: offset=2, size=1 -> [C]
242        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
243        let offsets = PrimitiveArray::from_iter(vec![0u32, 2]).into_array();
244        let sizes = PrimitiveArray::from_iter(vec![2u32, 1]).into_array();
245
246        let listview =
247            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
248
249        let compacted = listview.rebuild_remove_gaps();
250
251        // Should be unchanged (fast path)
252        assert_eq!(compacted.elements().len(), 3);
253        assert_eq!(compacted.offset_at(0), 0);
254        assert_eq!(compacted.offset_at(1), 2);
255    }
256
257    #[test]
258    fn test_rebuild_remove_gaps_preserves_overlaps() {
259        // Create a list view with overlapping lists and garbage: [_, A, B, C, _]
260        // List 0: offset=1, size=3 -> [A, B, C]
261        // List 1: offset=2, size=2 -> [B, C] (overlaps with List 0)
262        let elements = PrimitiveArray::from_iter(vec![99i32, 1, 2, 3, 98]).into_array();
263        let offsets = PrimitiveArray::from_iter(vec![1u32, 2]).into_array();
264        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
265
266        let listview =
267            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
268
269        let compacted = listview.rebuild_remove_gaps();
270
271        // After GC: elements should be [A, B, C] = [1, 2, 3]
272        assert_eq!(compacted.elements().len(), 3);
273
274        // Offsets should be remapped but still overlapping:
275        // old offset 1 -> new offset 0, old offset 2 -> new offset 1
276        assert_eq!(compacted.offset_at(0), 0);
277        assert_eq!(compacted.size_at(0), 3);
278        assert_eq!(compacted.offset_at(1), 1);
279        assert_eq!(compacted.size_at(1), 2);
280
281        // Verify both lists still overlap and contain correct data
282        let list0 = compacted.list_elements_at(0).to_primitive();
283        assert_eq!(list0.as_slice::<i32>(), &[1, 2, 3]);
284
285        let list1 = compacted.list_elements_at(1).to_primitive();
286        assert_eq!(list1.as_slice::<i32>(), &[2, 3]);
287    }
288
289    #[test]
290    fn test_rebuild_remove_gaps_multiple_gaps() {
291        // Create a list view with multiple gaps:
292        // [_, A, B, _, C, D, _, E, F, _]
293        //  0  1  2  3  4  5  6  7  8  9
294        // List 0: offset=1, size=2 -> [A, B]
295        // List 1: offset=4, size=2 -> [C, D]
296        // List 2: offset=7, size=2 -> [E, F]
297        // Gaps at: 0, 3, 6, 9
298        let elements =
299            PrimitiveArray::from_iter(vec![99i32, 1, 2, 98, 3, 4, 97, 5, 6, 96]).into_array();
300        let offsets = PrimitiveArray::from_iter(vec![1u32, 4, 7]).into_array();
301        let sizes = PrimitiveArray::from_iter(vec![2u32, 2, 2]).into_array();
302
303        let listview =
304            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
305
306        let compacted = listview.rebuild_remove_gaps();
307
308        // After GC: elements should be [A, B, C, D, E, F] = [1, 2, 3, 4, 5, 6]
309        assert_eq!(compacted.elements().len(), 6);
310
311        // Verify offset remapping:
312        // old offset 1 -> new offset 0 (after removing leading gap)
313        // old offset 4 -> new offset 2 (after removing gaps at 0, 3)
314        // old offset 7 -> new offset 4 (after removing gaps at 0, 3, 6)
315        assert_eq!(compacted.offset_at(0), 0);
316        assert_eq!(compacted.size_at(0), 2);
317        assert_eq!(compacted.offset_at(1), 2);
318        assert_eq!(compacted.size_at(1), 2);
319        assert_eq!(compacted.offset_at(2), 4);
320        assert_eq!(compacted.size_at(2), 2);
321
322        // Verify the data is correct
323        let list0 = compacted.list_elements_at(0).to_primitive();
324        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
325
326        let list1 = compacted.list_elements_at(1).to_primitive();
327        assert_eq!(list1.as_slice::<i32>(), &[3, 4]);
328
329        let list2 = compacted.list_elements_at(2).to_primitive();
330        assert_eq!(list2.as_slice::<i32>(), &[5, 6]);
331    }
332
333    #[test]
334    fn test_rebuild_flatten_removes_overlaps() {
335        // Create a list view with overlapping lists: [A, B, C]
336        // List 0: offset=0, size=3 -> [A, B, C]
337        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
338        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
339        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
340        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
341
342        let listview =
343            ListViewArray::try_new(elements, offsets, sizes, Validity::NonNullable).unwrap();
344
345        let flattened = listview.rebuild_zero_copy_to_list();
346
347        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
348        // Lists should be sequential with no overlaps
349        assert_eq!(flattened.elements().len(), 5);
350
351        // Offsets should be sequential
352        assert_eq!(flattened.offset_at(0), 0);
353        assert_eq!(flattened.size_at(0), 3);
354        assert_eq!(flattened.offset_at(1), 3);
355        assert_eq!(flattened.size_at(1), 2);
356
357        // Verify the data is correct
358        let list0 = flattened.list_elements_at(0).to_primitive();
359        assert_eq!(list0.as_slice::<i32>(), &[1, 2, 3]);
360
361        let list1 = flattened.list_elements_at(1).to_primitive();
362        assert_eq!(list1.as_slice::<i32>(), &[2, 3]);
363    }
364
365    #[test]
366    fn test_rebuild_flatten_with_nullable() {
367        use arrow_buffer::BooleanBuffer;
368
369        use crate::arrays::BoolArray;
370
371        // Create a nullable list view with a null list
372        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
373        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
374        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
375        let validity = Validity::Array(
376            BoolArray::from(BooleanBuffer::from(vec![true, false, true])).into_array(),
377        );
378
379        let listview = ListViewArray::try_new(elements, offsets, sizes, validity).unwrap();
380
381        let flattened = listview.rebuild_zero_copy_to_list();
382
383        // Verify nullability is preserved
384        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
385        assert!(flattened.validity().is_valid(0));
386        assert!(!flattened.validity().is_valid(1));
387        assert!(flattened.validity().is_valid(2));
388
389        // Verify valid lists contain correct data
390        let list0 = flattened.list_elements_at(0).to_primitive();
391        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
392
393        let list2 = flattened.list_elements_at(2).to_primitive();
394        assert_eq!(list2.as_slice::<i32>(), &[3]);
395    }
396}