vortex_array/arrays/listview/
rebuild.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use num_traits::FromPrimitive;
5use vortex_dtype::{IntegerPType, Nullability, match_each_integer_ptype};
6use vortex_error::VortexExpect;
7use vortex_scalar::Scalar;
8
9use crate::arrays::ListViewArray;
10use crate::builders::{ArrayBuilder, ListViewBuilder};
11use crate::vtable::ValidityHelper;
12use crate::{Array, compute};
13
14/// Modes for rebuilding a [`ListViewArray`].
15pub enum ListViewRebuildMode {
16    /// Removes all unused data and flattens out all list data, such that the array is zero-copyable
17    /// to a [`ListArray`].
18    ///
19    /// This mode will deduplicate all overlapping list views, such that the [`ListViewArray`] looks
20    /// like a [`ListArray`] but with an additional `sizes` array.
21    ///
22    /// [`ListArray`]: crate::arrays::ListArray
23    MakeZeroCopyToList,
24
25    /// Removes any leading or trailing elements that are unused / not referenced by any views in
26    /// the [`ListViewArray`].
27    TrimElements,
28
29    /// Equivalent to `MakeZeroCopyToList` plus `TrimElements`.
30    ///
31    /// This is useful when concatenating multiple [`ListViewArray`]s together to create a new
32    /// [`ListViewArray`] that is also zero-copy to a [`ListArray`].
33    ///
34    /// [`ListArray`]: crate::arrays::ListArray
35    MakeExact,
36
37    // TODO(connor)[ListView]: Implement some version of this.
38    /// Finds the shortest packing / overlapping of list elements.
39    ///
40    /// This problem is known to be NP-hard, so maybe when someone proves that P=NP we can implement
41    /// this algorithm (but in all seriousness there are many approximate algorithms that could
42    /// work well here).
43    OverlapCompression,
44}
45
46impl ListViewArray {
47    /// Rebuilds the [`ListViewArray`] according to the specified mode.
48    pub fn rebuild(&self, mode: ListViewRebuildMode) -> ListViewArray {
49        if self.is_empty() {
50            return self.clone();
51        }
52
53        match mode {
54            ListViewRebuildMode::MakeZeroCopyToList => self.rebuild_zero_copy_to_list(),
55            ListViewRebuildMode::TrimElements => self.rebuild_trim_elements(),
56            ListViewRebuildMode::MakeExact => self.rebuild_make_exact(),
57            ListViewRebuildMode::OverlapCompression => unimplemented!("Does P=NP?"),
58        }
59    }
60
61    /// Rebuilds a [`ListViewArray`], removing all data overlaps and creating a flattened layout.
62    ///
63    /// This is useful when the `elements` child array of the [`ListViewArray`] might have
64    /// overlapping, duplicate, and garbage data, and we want to have fully sequential data like
65    /// a [`ListArray`].
66    ///
67    /// [`ListArray`]: crate::arrays::ListArray
68    fn rebuild_zero_copy_to_list(&self) -> ListViewArray {
69        if self.is_zero_copy_to_list() {
70            // Note that since everything in `ListViewArray` is `Arc`ed, this is quite cheap.
71            return self.clone();
72        }
73
74        let offsets_ptype = self.offsets().dtype().as_ptype();
75        let sizes_ptype = self.sizes().dtype().as_ptype();
76
77        match_each_integer_ptype!(offsets_ptype, |O| {
78            match_each_integer_ptype!(sizes_ptype, |S| { self.naive_rebuild::<O, S>() })
79        })
80    }
81
82    /// The inner function for `rebuild_zero_copy_to_list`, which naively rebuilds a `ListViewArray`
83    /// via `append_scalar`.
84    fn naive_rebuild<O: IntegerPType, S: IntegerPType>(&self) -> ListViewArray {
85        let element_dtype = self
86            .dtype()
87            .as_list_element_opt()
88            .vortex_expect("somehow had a canonical list that was not a list");
89
90        let mut builder = ListViewBuilder::<O, S>::with_capacity(
91            element_dtype.clone(),
92            self.dtype().nullability(),
93            self.elements().len(),
94            self.len(),
95        );
96
97        for i in 0..self.len() {
98            let list = self.scalar_at(i);
99
100            builder
101                .append_scalar(&list)
102                .vortex_expect("was unable to extend the `ListViewBuilder`")
103        }
104
105        builder.finish_into_listview()
106    }
107
108    /// Rebuilds a [`ListViewArray`] by trimming any unused / unreferenced leading and trailing
109    /// elements, which is defined as a contiguous run of values in the `elements` array that are
110    /// not referecened by any views in the corresponding [`ListViewArray`].
111    fn rebuild_trim_elements(&self) -> ListViewArray {
112        let start = if self.is_zero_copy_to_list() {
113            // If offsets are sorted, then the minimum offset is the first offset.
114            // Note that even if the first view is null, offsets must always be valid, so it is
115            // completely fine for us to use this as a lower-bounded start of the `elements`.
116            self.offset_at(0)
117        } else {
118            self.offsets().statistics().compute_min().vortex_expect(
119                "[ListViewArray::rebuild]: `offsets` must report min statistic that is a `usize`",
120            )
121        };
122
123        let end = if self.is_zero_copy_to_list() {
124            // If offsets are sorted and there are no overlaps (views are always "increasing"), we
125            // can just grab the last offset and last size.
126            let last_offset = self.offset_at(self.len() - 1);
127            let last_size = self.size_at(self.len() - 1);
128            last_offset + last_size
129        } else {
130            let min_max = compute::min_max(
131                &compute::add(self.offsets(), self.sizes())
132                    .vortex_expect("`offsets + sizes` somehow overflowed"),
133            )
134            .vortex_expect("Something went wrong while computing min and max")
135            .vortex_expect("We checked that the array was not empty in the top-level `rebuild`");
136
137            min_max
138                .max
139                .as_primitive()
140                .as_::<usize>()
141                .vortex_expect("unable to interpret the max `offset + size` as a `usize`")
142        };
143
144        let adjusted_offsets = match_each_integer_ptype!(self.offsets().dtype().as_ptype(), |O| {
145            let offset = <O as FromPrimitive>::from_usize(start)
146                .vortex_expect("unable to convert the min offset `start` into a `usize`");
147            let scalar = Scalar::primitive(offset, Nullability::NonNullable);
148
149            compute::sub_scalar(self.offsets(), scalar)
150                .vortex_expect("was somehow unable to adjust offsets down by their minimum")
151        });
152
153        let sliced_elements = self.elements().slice(start..end);
154
155        // SAFETY: The only thing we changed was the elements (which we verify through mins and
156        // maxes that all adjusted offsets + sizes are within the correct bounds), so the parameters
157        // are valid. And if the original array was zero-copyable to list, trimming elements doesn't
158        // change that property.
159        unsafe {
160            ListViewArray::new_unchecked(
161                sliced_elements,
162                adjusted_offsets,
163                self.sizes().clone(),
164                self.validity().clone(),
165            )
166            .with_zero_copy_to_list(self.is_zero_copy_to_list())
167        }
168    }
169
170    fn rebuild_make_exact(&self) -> ListViewArray {
171        if self.is_zero_copy_to_list() {
172            self.rebuild_trim_elements()
173        } else {
174            // When we completely rebuild the `ListViewArray`, we get the benefit that we also trim
175            // any leading and trailing garbage data.
176            self.rebuild_zero_copy_to_list()
177        }
178    }
179}
180
181#[cfg(test)]
182mod tests {
183    use vortex_buffer::BitBuffer;
184    use vortex_dtype::Nullability;
185
186    use super::ListViewRebuildMode;
187    use crate::arrays::{ListViewArray, PrimitiveArray};
188    use crate::validity::Validity;
189    use crate::vtable::ValidityHelper;
190    use crate::{IntoArray, ToCanonical};
191
192    #[test]
193    fn test_rebuild_flatten_removes_overlaps() {
194        // Create a list view with overlapping lists: [A, B, C]
195        // List 0: offset=0, size=3 -> [A, B, C]
196        // List 1: offset=1, size=2 -> [B, C] (overlaps with List 0)
197        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
198        let offsets = PrimitiveArray::from_iter(vec![0u32, 1]).into_array();
199        let sizes = PrimitiveArray::from_iter(vec![3u32, 2]).into_array();
200
201        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
202
203        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
204
205        // After flatten: elements should be [A, B, C, B, C] = [1, 2, 3, 2, 3]
206        // Lists should be sequential with no overlaps
207        assert_eq!(flattened.elements().len(), 5);
208
209        // Offsets should be sequential
210        assert_eq!(flattened.offset_at(0), 0);
211        assert_eq!(flattened.size_at(0), 3);
212        assert_eq!(flattened.offset_at(1), 3);
213        assert_eq!(flattened.size_at(1), 2);
214
215        // Verify the data is correct
216        let list0 = flattened.list_elements_at(0).to_primitive();
217        assert_eq!(list0.as_slice::<i32>(), &[1, 2, 3]);
218
219        let list1 = flattened.list_elements_at(1).to_primitive();
220        assert_eq!(list1.as_slice::<i32>(), &[2, 3]);
221    }
222
223    #[test]
224    fn test_rebuild_flatten_with_nullable() {
225        use crate::arrays::BoolArray;
226
227        // Create a nullable list view with a null list
228        let elements = PrimitiveArray::from_iter(vec![1i32, 2, 3]).into_array();
229        let offsets = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
230        let sizes = PrimitiveArray::from_iter(vec![2u32, 1, 1]).into_array();
231        let validity = Validity::Array(
232            BoolArray::from_bit_buffer(
233                BitBuffer::from(vec![true, false, true]),
234                Validity::NonNullable,
235            )
236            .into_array(),
237        );
238
239        let listview = ListViewArray::new(elements, offsets, sizes, validity);
240
241        let flattened = listview.rebuild(ListViewRebuildMode::MakeZeroCopyToList);
242
243        // Verify nullability is preserved
244        assert_eq!(flattened.dtype().nullability(), Nullability::Nullable);
245        assert!(flattened.validity().is_valid(0));
246        assert!(!flattened.validity().is_valid(1));
247        assert!(flattened.validity().is_valid(2));
248
249        // Verify valid lists contain correct data
250        let list0 = flattened.list_elements_at(0).to_primitive();
251        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
252
253        let list2 = flattened.list_elements_at(2).to_primitive();
254        assert_eq!(list2.as_slice::<i32>(), &[3]);
255    }
256
257    #[test]
258    fn test_rebuild_trim_elements_basic() {
259        // Test trimming both leading and trailing unused elements while preserving gaps in the
260        // middle.
261        // Elements: [_, _, A, B, _, C, D, _, _]
262        //            0  1  2  3  4  5  6  7  8
263        // List 0: offset=2, size=2 -> [A, B]
264        // List 1: offset=5, size=2 -> [C, D]
265        // Should trim to: [A, B, _, C, D] with adjusted offsets.
266        let elements =
267            PrimitiveArray::from_iter(vec![99i32, 98, 1, 2, 97, 3, 4, 96, 95]).into_array();
268        let offsets = PrimitiveArray::from_iter(vec![2u32, 5]).into_array();
269        let sizes = PrimitiveArray::from_iter(vec![2u32, 2]).into_array();
270
271        let listview = ListViewArray::new(elements, offsets, sizes, Validity::NonNullable);
272
273        let trimmed = listview.rebuild(ListViewRebuildMode::TrimElements);
274
275        // After trimming: elements should be [A, B, _, C, D] = [1, 2, 97, 3, 4].
276        assert_eq!(trimmed.elements().len(), 5);
277
278        // Offsets should be adjusted: old offset 2 -> new offset 0, old offset 5 -> new offset 3.
279        assert_eq!(trimmed.offset_at(0), 0);
280        assert_eq!(trimmed.size_at(0), 2);
281        assert_eq!(trimmed.offset_at(1), 3);
282        assert_eq!(trimmed.size_at(1), 2);
283
284        // Verify the data is correct.
285        let list0 = trimmed.list_elements_at(0).to_primitive();
286        assert_eq!(list0.as_slice::<i32>(), &[1, 2]);
287
288        let list1 = trimmed.list_elements_at(1).to_primitive();
289        assert_eq!(list1.as_slice::<i32>(), &[3, 4]);
290
291        // Note that element at index 2 (97) is preserved as a gap.
292        let all_elements = trimmed.elements().to_primitive();
293        assert_eq!(all_elements.scalar_at(2), 97i32.into());
294    }
295}