lance_encoding/encodings/logical/
list.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{ops::Range, sync::Arc};
5
6use arrow_array::{cast::AsArray, Array, ArrayRef, LargeListArray, ListArray};
7use arrow_schema::DataType;
8use futures::future::BoxFuture;
9use lance_arrow::deepcopy::deep_copy_nulls;
10use lance_arrow::list::ListArrayExt;
11use lance_core::Result;
12
13use crate::{
14    decoder::{
15        DecodedArray, FilterExpression, ScheduledScanLine, SchedulerContext,
16        StructuralDecodeArrayTask, StructuralFieldDecoder, StructuralFieldScheduler,
17        StructuralSchedulingJob,
18    },
19    encoder::{EncodeTask, FieldEncoder, OutOfLineBuffers},
20    repdef::RepDefBuilder,
21};
22
23/// A structural encoder for list fields
24///
25/// The list's offsets are added to the rep/def builder
26/// and the list array's values are passed to the child encoder
27///
28/// The values will have any garbage values removed and will be trimmed
29/// to only include the values that are actually used.
30pub struct ListStructuralEncoder {
31    keep_original_array: bool,
32    child: Box<dyn FieldEncoder>,
33}
34
35impl ListStructuralEncoder {
36    pub fn new(keep_original_array: bool, child: Box<dyn FieldEncoder>) -> Self {
37        Self {
38            keep_original_array,
39            child,
40        }
41    }
42}
43
44impl FieldEncoder for ListStructuralEncoder {
45    fn maybe_encode(
46        &mut self,
47        array: ArrayRef,
48        external_buffers: &mut OutOfLineBuffers,
49        mut repdef: RepDefBuilder,
50        row_number: u64,
51        num_rows: u64,
52    ) -> Result<Vec<EncodeTask>> {
53        let values = if let Some(list_arr) = array.as_list_opt::<i32>() {
54            let has_garbage_values = if self.keep_original_array {
55                repdef.add_offsets(list_arr.offsets().clone(), array.nulls().cloned())
56            } else {
57                // there is no need to deep copy offsets, because offset buffers will be cast to a common type (i64).
58                repdef.add_offsets(list_arr.offsets().clone(), deep_copy_nulls(array.nulls()))
59            };
60            if has_garbage_values {
61                list_arr.filter_garbage_nulls().trimmed_values()
62            } else {
63                list_arr.trimmed_values()
64            }
65        } else if let Some(list_arr) = array.as_list_opt::<i64>() {
66            let has_garbage_values = if self.keep_original_array {
67                repdef.add_offsets(list_arr.offsets().clone(), array.nulls().cloned())
68            } else {
69                repdef.add_offsets(list_arr.offsets().clone(), deep_copy_nulls(array.nulls()))
70            };
71            if has_garbage_values {
72                list_arr.filter_garbage_nulls().trimmed_values()
73            } else {
74                list_arr.trimmed_values()
75            }
76        } else {
77            panic!("List encoder used for non-list data")
78        };
79        self.child
80            .maybe_encode(values, external_buffers, repdef, row_number, num_rows)
81    }
82
83    fn flush(&mut self, external_buffers: &mut OutOfLineBuffers) -> Result<Vec<EncodeTask>> {
84        self.child.flush(external_buffers)
85    }
86
87    fn num_columns(&self) -> u32 {
88        self.child.num_columns()
89    }
90
91    fn finish(
92        &mut self,
93        external_buffers: &mut OutOfLineBuffers,
94    ) -> BoxFuture<'_, Result<Vec<crate::encoder::EncodedColumn>>> {
95        self.child.finish(external_buffers)
96    }
97}
98
99#[derive(Debug)]
100pub struct StructuralListScheduler {
101    child: Box<dyn StructuralFieldScheduler>,
102}
103
104impl StructuralListScheduler {
105    pub fn new(child: Box<dyn StructuralFieldScheduler>) -> Self {
106        Self { child }
107    }
108}
109
110impl StructuralFieldScheduler for StructuralListScheduler {
111    fn schedule_ranges<'a>(
112        &'a self,
113        ranges: &[Range<u64>],
114        filter: &FilterExpression,
115    ) -> Result<Box<dyn StructuralSchedulingJob + 'a>> {
116        let child = self.child.schedule_ranges(ranges, filter)?;
117
118        Ok(Box::new(StructuralListSchedulingJob::new(child)))
119    }
120
121    fn initialize<'a>(
122        &'a mut self,
123        filter: &'a FilterExpression,
124        context: &'a SchedulerContext,
125    ) -> BoxFuture<'a, Result<()>> {
126        self.child.initialize(filter, context)
127    }
128}
129
130/// Scheduling job for list data
131///
132/// Scheduling is handled by the primitive encoder and nothing special
133/// happens here.
134#[derive(Debug)]
135struct StructuralListSchedulingJob<'a> {
136    child: Box<dyn StructuralSchedulingJob + 'a>,
137}
138
139impl<'a> StructuralListSchedulingJob<'a> {
140    fn new(child: Box<dyn StructuralSchedulingJob + 'a>) -> Self {
141        Self { child }
142    }
143}
144
145impl StructuralSchedulingJob for StructuralListSchedulingJob<'_> {
146    fn schedule_next(
147        &mut self,
148        context: &mut SchedulerContext,
149    ) -> Result<Option<ScheduledScanLine>> {
150        self.child.schedule_next(context)
151    }
152}
153
154#[derive(Debug)]
155pub struct StructuralListDecoder {
156    child: Box<dyn StructuralFieldDecoder>,
157    data_type: DataType,
158}
159
160impl StructuralListDecoder {
161    pub fn new(child: Box<dyn StructuralFieldDecoder>, data_type: DataType) -> Self {
162        Self { child, data_type }
163    }
164}
165
166impl StructuralFieldDecoder for StructuralListDecoder {
167    fn accept_page(&mut self, child: crate::decoder::LoadedPage) -> Result<()> {
168        self.child.accept_page(child)
169    }
170
171    fn drain(&mut self, num_rows: u64) -> Result<Box<dyn StructuralDecodeArrayTask>> {
172        let child_task = self.child.drain(num_rows)?;
173        Ok(Box::new(StructuralListDecodeTask::new(
174            child_task,
175            self.data_type.clone(),
176        )))
177    }
178
179    fn data_type(&self) -> &DataType {
180        &self.data_type
181    }
182}
183
184#[derive(Debug)]
185struct StructuralListDecodeTask {
186    child_task: Box<dyn StructuralDecodeArrayTask>,
187    data_type: DataType,
188}
189
190impl StructuralListDecodeTask {
191    fn new(child_task: Box<dyn StructuralDecodeArrayTask>, data_type: DataType) -> Self {
192        Self {
193            child_task,
194            data_type,
195        }
196    }
197}
198
199impl StructuralDecodeArrayTask for StructuralListDecodeTask {
200    fn decode(self: Box<Self>) -> Result<DecodedArray> {
201        let DecodedArray { array, mut repdef } = self.child_task.decode()?;
202        match &self.data_type {
203            DataType::List(child_field) => {
204                let (offsets, validity) = repdef.unravel_offsets::<i32>()?;
205                let list_array = ListArray::try_new(child_field.clone(), offsets, array, validity)?;
206                Ok(DecodedArray {
207                    array: Arc::new(list_array),
208                    repdef,
209                })
210            }
211            DataType::LargeList(child_field) => {
212                let (offsets, validity) = repdef.unravel_offsets::<i64>()?;
213                let list_array =
214                    LargeListArray::try_new(child_field.clone(), offsets, array, validity)?;
215                Ok(DecodedArray {
216                    array: Arc::new(list_array),
217                    repdef,
218                })
219            }
220            _ => panic!("List decoder did not have a list field"),
221        }
222    }
223}
224
225#[cfg(test)]
226mod tests {
227
228    use std::{collections::HashMap, sync::Arc};
229
230    use arrow::array::{Int64Builder, LargeListBuilder, StringBuilder};
231    use arrow_array::{
232        builder::{Int32Builder, ListBuilder},
233        Array, ArrayRef, BooleanArray, DictionaryArray, LargeStringArray, ListArray, StructArray,
234        UInt64Array, UInt8Array,
235    };
236    use arrow_buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer};
237    use arrow_schema::{DataType, Field, Fields};
238    use lance_core::datatypes::{
239        STRUCTURAL_ENCODING_FULLZIP, STRUCTURAL_ENCODING_META_KEY, STRUCTURAL_ENCODING_MINIBLOCK,
240    };
241    use rstest::rstest;
242
243    use crate::{
244        testing::{check_round_trip_encoding_of_data, check_round_trip_encoding_random, TestCases},
245        version::LanceFileVersion,
246    };
247
248    fn make_list_type(inner_type: DataType) -> DataType {
249        DataType::List(Arc::new(Field::new("item", inner_type, true)))
250    }
251
252    fn make_large_list_type(inner_type: DataType) -> DataType {
253        DataType::LargeList(Arc::new(Field::new("item", inner_type, true)))
254    }
255
256    #[rstest]
257    #[test_log::test(tokio::test)]
258    async fn test_list(
259        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
260        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
261        structural_encoding: &str,
262    ) {
263        let mut field_metadata = HashMap::new();
264        field_metadata.insert(
265            STRUCTURAL_ENCODING_META_KEY.to_string(),
266            structural_encoding.into(),
267        );
268        let field =
269            Field::new("", make_list_type(DataType::Int32), true).with_metadata(field_metadata);
270        check_round_trip_encoding_random(field, version).await;
271    }
272
273    #[rstest]
274    #[test_log::test(tokio::test)]
275    async fn test_deeply_nested_lists(
276        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
277        structural_encoding: &str,
278    ) {
279        let mut field_metadata = HashMap::new();
280        field_metadata.insert(
281            STRUCTURAL_ENCODING_META_KEY.to_string(),
282            structural_encoding.into(),
283        );
284        let field = Field::new("item", DataType::Int32, true).with_metadata(field_metadata);
285        for _ in 0..5 {
286            let field = Field::new("", make_list_type(field.data_type().clone()), true);
287            check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
288        }
289    }
290
291    #[test_log::test(tokio::test)]
292    async fn test_large_list() {
293        let field = Field::new("", make_large_list_type(DataType::Int32), true);
294        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
295    }
296
297    #[test_log::test(tokio::test)]
298    async fn test_nested_strings() {
299        let field = Field::new("", make_list_type(DataType::Utf8), true);
300        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
301    }
302
303    #[test_log::test(tokio::test)]
304    async fn test_nested_list() {
305        let field = Field::new("", make_list_type(make_list_type(DataType::Int32)), true);
306        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
307    }
308
309    #[test_log::test(tokio::test)]
310    async fn test_list_struct_list() {
311        let struct_type = DataType::Struct(Fields::from(vec![Field::new(
312            "inner_str",
313            DataType::Utf8,
314            false,
315        )]));
316
317        let field = Field::new("", make_list_type(struct_type), true);
318        check_round_trip_encoding_random(field, LanceFileVersion::V2_0).await;
319    }
320
321    #[test_log::test(tokio::test)]
322    async fn test_list_struct_empty() {
323        let fields = Fields::from(vec![Field::new("inner", DataType::UInt64, true)]);
324        let items = UInt64Array::from(Vec::<u64>::new());
325        let structs = StructArray::new(fields, vec![Arc::new(items)], None);
326        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0; 2 * 1024 * 1024 + 1]));
327        let lists = ListArray::new(
328            Arc::new(Field::new("item", structs.data_type().clone(), true)),
329            offsets,
330            Arc::new(structs),
331            None,
332        );
333
334        check_round_trip_encoding_of_data(
335            vec![Arc::new(lists)],
336            &TestCases::default(),
337            HashMap::new(),
338        )
339        .await;
340    }
341
342    #[rstest]
343    #[test_log::test(tokio::test)]
344    async fn test_simple_list(
345        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
346        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
347        structural_encoding: &str,
348    ) {
349        let items_builder = Int32Builder::new();
350        let mut list_builder = ListBuilder::new(items_builder);
351        list_builder.append_value([Some(1), Some(2), Some(3)]);
352        list_builder.append_value([Some(4), Some(5)]);
353        list_builder.append_null();
354        list_builder.append_value([Some(6), Some(7), Some(8)]);
355        let list_array = list_builder.finish();
356
357        let mut field_metadata = HashMap::new();
358        field_metadata.insert(
359            STRUCTURAL_ENCODING_META_KEY.to_string(),
360            structural_encoding.into(),
361        );
362
363        let test_cases = TestCases::default()
364            .with_range(0..2)
365            .with_range(0..3)
366            .with_range(1..3)
367            .with_indices(vec![1, 3])
368            .with_indices(vec![2])
369            .with_file_version(version);
370        check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
371            .await;
372    }
373
374    #[rstest]
375    #[test_log::test(tokio::test)]
376    async fn test_simple_nested_list_ends_with_null(
377        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
378        structural_encoding: &str,
379    ) {
380        use arrow_array::Int32Array;
381
382        let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
383        let inner_offsets = ScalarBuffer::<i32>::from(vec![0, 1, 2, 3, 4, 5, 5]);
384        let inner_validity = BooleanBuffer::from(vec![true, true, true, true, true, false]);
385        let outer_offsets = ScalarBuffer::<i32>::from(vec![0, 1, 2, 3, 4, 5, 6, 6]);
386        let outer_validity = BooleanBuffer::from(vec![true, true, true, true, true, true, false]);
387
388        let inner_list = ListArray::new(
389            Arc::new(Field::new("item", DataType::Int32, true)),
390            OffsetBuffer::new(inner_offsets),
391            Arc::new(values),
392            Some(NullBuffer::new(inner_validity)),
393        );
394        let outer_list = ListArray::new(
395            Arc::new(Field::new(
396                "item",
397                DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
398                true,
399            )),
400            OffsetBuffer::new(outer_offsets),
401            Arc::new(inner_list),
402            Some(NullBuffer::new(outer_validity)),
403        );
404
405        let mut field_metadata = HashMap::new();
406        field_metadata.insert(
407            STRUCTURAL_ENCODING_META_KEY.to_string(),
408            structural_encoding.into(),
409        );
410
411        let test_cases = TestCases::default()
412            .with_range(0..2)
413            .with_range(0..3)
414            .with_range(5..7)
415            .with_indices(vec![1, 6])
416            .with_indices(vec![6])
417            .with_file_version(LanceFileVersion::V2_1);
418        check_round_trip_encoding_of_data(vec![Arc::new(outer_list)], &test_cases, field_metadata)
419            .await;
420    }
421
422    #[rstest]
423    #[test_log::test(tokio::test)]
424    async fn test_simple_string_list(
425        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
426        structural_encoding: &str,
427    ) {
428        let items_builder = StringBuilder::new();
429        let mut list_builder = ListBuilder::new(items_builder);
430        list_builder.append_value([Some("a"), Some("bc"), Some("def")]);
431        list_builder.append_value([Some("gh"), None]);
432        list_builder.append_null();
433        list_builder.append_value([Some("ijk"), Some("lmnop"), Some("qrs")]);
434        let list_array = list_builder.finish();
435
436        let mut field_metadata = HashMap::new();
437        field_metadata.insert(
438            STRUCTURAL_ENCODING_META_KEY.to_string(),
439            structural_encoding.into(),
440        );
441
442        let test_cases = TestCases::default()
443            .with_range(0..2)
444            .with_range(0..3)
445            .with_range(1..3)
446            .with_indices(vec![1, 3])
447            .with_indices(vec![2])
448            .with_file_version(LanceFileVersion::V2_1);
449        check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
450            .await;
451    }
452
453    #[rstest]
454    #[test_log::test(tokio::test)]
455    async fn test_simple_sliced_list(
456        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
457        structural_encoding: &str,
458    ) {
459        let items_builder = Int32Builder::new();
460        let mut list_builder = ListBuilder::new(items_builder);
461        list_builder.append_value([Some(1), Some(2), Some(3)]);
462        list_builder.append_value([Some(4), Some(5)]);
463        list_builder.append_null();
464        list_builder.append_value([Some(6), Some(7), Some(8)]);
465        let list_array = list_builder.finish();
466
467        let list_array = list_array.slice(1, 2);
468
469        let mut field_metadata = HashMap::new();
470        field_metadata.insert(
471            STRUCTURAL_ENCODING_META_KEY.to_string(),
472            structural_encoding.into(),
473        );
474
475        let test_cases = TestCases::default()
476            .with_range(0..2)
477            .with_range(1..2)
478            .with_indices(vec![0])
479            .with_indices(vec![1])
480            .with_file_version(LanceFileVersion::V2_1);
481        check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
482            .await;
483    }
484
485    #[test_log::test(tokio::test)]
486    async fn test_simple_list_dict() {
487        let values = LargeStringArray::from_iter_values(["a", "bb", "ccc"]);
488        let indices = UInt8Array::from(vec![0, 1, 2, 0, 1, 2, 0, 1, 2]);
489        let dict_array = DictionaryArray::new(indices, Arc::new(values));
490        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 9]));
491        let list_array = ListArray::new(
492            Arc::new(Field::new("item", dict_array.data_type().clone(), true)),
493            offsets,
494            Arc::new(dict_array),
495            None,
496        );
497
498        let test_cases = TestCases::default()
499            .with_range(0..2)
500            .with_range(1..3)
501            .with_range(2..4)
502            .with_indices(vec![1])
503            .with_indices(vec![2]);
504        check_round_trip_encoding_of_data(
505            vec![Arc::new(list_array)],
506            &test_cases,
507            HashMap::default(),
508        )
509        .await;
510    }
511
512    #[rstest]
513    #[test_log::test(tokio::test)]
514    async fn test_list_with_garbage_nulls(
515        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
516        structural_encoding: &str,
517    ) {
518        // In Arrow, list nulls are allowed to be non-empty, with masked garbage values
519        // Here we make a list with a null row in the middle with 3 garbage values
520        let items = UInt64Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
521        let offsets = ScalarBuffer::<i32>::from(vec![0, 5, 8, 10]);
522        let offsets = OffsetBuffer::new(offsets);
523        let list_validity = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
524        let list_arr = ListArray::new(
525            Arc::new(Field::new("item", DataType::UInt64, true)),
526            offsets,
527            Arc::new(items),
528            Some(list_validity),
529        );
530
531        let mut field_metadata = HashMap::new();
532        field_metadata.insert(
533            STRUCTURAL_ENCODING_META_KEY.to_string(),
534            structural_encoding.into(),
535        );
536
537        let test_cases = TestCases::default()
538            .with_range(0..3)
539            .with_range(1..2)
540            .with_indices(vec![1])
541            .with_indices(vec![2])
542            .with_file_version(LanceFileVersion::V2_1);
543        check_round_trip_encoding_of_data(vec![Arc::new(list_arr)], &test_cases, field_metadata)
544            .await;
545    }
546
547    #[rstest]
548    #[test_log::test(tokio::test)]
549    async fn test_simple_two_page_list(
550        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
551        structural_encoding: &str,
552    ) {
553        // This is a simple pre-defined list that spans two pages.  This test is useful for
554        // debugging the repetition index
555        let items_builder = Int64Builder::new();
556        let mut list_builder = ListBuilder::new(items_builder);
557        for i in 0..512 {
558            list_builder.append_value([Some(i), Some(i * 2)]);
559        }
560        let list_array_1 = list_builder.finish();
561
562        let items_builder = Int64Builder::new();
563        let mut list_builder = ListBuilder::new(items_builder);
564        for i in 0..512 {
565            let i = i + 512;
566            list_builder.append_value([Some(i), Some(i * 2)]);
567        }
568        let list_array_2 = list_builder.finish();
569
570        let mut metadata = HashMap::new();
571        metadata.insert(
572            STRUCTURAL_ENCODING_META_KEY.to_string(),
573            structural_encoding.into(),
574        );
575
576        let test_cases = TestCases::default()
577            .with_file_version(LanceFileVersion::V2_1)
578            .with_page_sizes(vec![100])
579            .with_range(800..900);
580        check_round_trip_encoding_of_data(
581            vec![Arc::new(list_array_1), Arc::new(list_array_2)],
582            &test_cases,
583            metadata,
584        )
585        .await;
586    }
587
588    #[test_log::test(tokio::test)]
589    async fn test_simple_large_list() {
590        let items_builder = Int32Builder::new();
591        let mut list_builder = LargeListBuilder::new(items_builder);
592        list_builder.append_value([Some(1), Some(2), Some(3)]);
593        list_builder.append_value([Some(4), Some(5)]);
594        list_builder.append_null();
595        list_builder.append_value([Some(6), Some(7), Some(8)]);
596        let list_array = list_builder.finish();
597
598        let test_cases = TestCases::default()
599            .with_range(0..2)
600            .with_range(0..3)
601            .with_range(1..3)
602            .with_indices(vec![1, 3]);
603        check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, HashMap::new())
604            .await;
605    }
606
607    #[rstest]
608    #[test_log::test(tokio::test)]
609    async fn test_empty_lists(
610        #[values(LanceFileVersion::V2_0, LanceFileVersion::V2_1)] version: LanceFileVersion,
611        #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
612        structural_encoding: &str,
613    ) {
614        let mut field_metadata = HashMap::new();
615        field_metadata.insert(
616            STRUCTURAL_ENCODING_META_KEY.to_string(),
617            structural_encoding.into(),
618        );
619
620        // Scenario 1: Some lists are empty
621
622        let values = [vec![Some(1), Some(2), Some(3)], vec![], vec![None]];
623        // Test empty list at beginning, middle, and end
624        for order in [[0, 1, 2], [1, 0, 2], [2, 0, 1]] {
625            let items_builder = Int32Builder::new();
626            let mut list_builder = ListBuilder::new(items_builder);
627            for idx in order {
628                list_builder.append_value(values[idx].clone());
629            }
630            let list_array = Arc::new(list_builder.finish());
631            let test_cases = TestCases::default()
632                .with_indices(vec![1])
633                .with_indices(vec![0])
634                .with_indices(vec![2])
635                .with_indices(vec![0, 1])
636                .with_file_version(version);
637            check_round_trip_encoding_of_data(
638                vec![list_array.clone()],
639                &test_cases,
640                field_metadata.clone(),
641            )
642            .await;
643            let test_cases = test_cases.with_batch_size(1);
644            check_round_trip_encoding_of_data(
645                vec![list_array],
646                &test_cases,
647                field_metadata.clone(),
648            )
649            .await;
650        }
651
652        // Scenario 2: All lists are empty
653
654        // When encoding a list of empty lists there are no items to encode
655        // which is strange and we want to ensure we handle it
656        let items_builder = Int32Builder::new();
657        let mut list_builder = ListBuilder::new(items_builder);
658        list_builder.append(true);
659        list_builder.append_null();
660        list_builder.append(true);
661        let list_array = Arc::new(list_builder.finish());
662
663        let test_cases = TestCases::default()
664            .with_range(0..2)
665            .with_indices(vec![1])
666            .with_file_version(version);
667        check_round_trip_encoding_of_data(
668            vec![list_array.clone()],
669            &test_cases,
670            field_metadata.clone(),
671        )
672        .await;
673        let test_cases = test_cases.with_batch_size(1);
674        check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
675            .await;
676
677        // Scenario 2B: All lists are empty (but now with strings)
678
679        // When encoding a list of empty lists there are no items to encode
680        // which is strange and we want to ensure we handle it
681        let items_builder = StringBuilder::new();
682        let mut list_builder = ListBuilder::new(items_builder);
683        list_builder.append(true);
684        list_builder.append_null();
685        list_builder.append(true);
686        let list_array = Arc::new(list_builder.finish());
687
688        let test_cases = TestCases::default()
689            .with_range(0..2)
690            .with_indices(vec![1])
691            .with_file_version(version);
692        check_round_trip_encoding_of_data(
693            vec![list_array.clone()],
694            &test_cases,
695            field_metadata.clone(),
696        )
697        .await;
698        let test_cases = test_cases.with_batch_size(1);
699        check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
700            .await;
701
702        // Scenario 3: All lists are null
703
704        let items_builder = Int32Builder::new();
705        let mut list_builder = ListBuilder::new(items_builder);
706        list_builder.append_null();
707        list_builder.append_null();
708        list_builder.append_null();
709        let list_array = Arc::new(list_builder.finish());
710
711        let test_cases = TestCases::default()
712            .with_range(0..2)
713            .with_indices(vec![1])
714            .with_file_version(version);
715        check_round_trip_encoding_of_data(
716            vec![list_array.clone()],
717            &test_cases,
718            field_metadata.clone(),
719        )
720        .await;
721        let test_cases = test_cases.with_batch_size(1);
722        check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
723            .await;
724
725        if version < LanceFileVersion::V2_1 {
726            return;
727        }
728
729        // Scenario 4: All lists are null and inside a struct (only valid for 2.1 since 2.0 doesn't
730        // support null structs)
731        let items_builder = Int32Builder::new();
732        let mut list_builder = ListBuilder::new(items_builder);
733        list_builder.append_null();
734        list_builder.append_null();
735        list_builder.append_null();
736        let list_array = Arc::new(list_builder.finish());
737
738        let struct_validity = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
739        let struct_array = Arc::new(StructArray::new(
740            Fields::from(vec![Field::new(
741                "lists",
742                list_array.data_type().clone(),
743                true,
744            )]),
745            vec![list_array],
746            Some(struct_validity),
747        ));
748
749        let test_cases = TestCases::default()
750            .with_range(0..2)
751            .with_indices(vec![1])
752            .with_file_version(version);
753        check_round_trip_encoding_of_data(
754            vec![struct_array.clone()],
755            &test_cases,
756            field_metadata.clone(),
757        )
758        .await;
759        let test_cases = test_cases.with_batch_size(1);
760        check_round_trip_encoding_of_data(vec![struct_array], &test_cases, field_metadata.clone())
761            .await;
762    }
763
764    #[test_log::test(tokio::test)]
765    #[ignore] // This test is quite slow in debug mode
766    async fn test_jumbo_list() {
767        // This is an overflow test.  We have a list of lists where each list
768        // has 1Mi items.  We encode 5000 of these lists and so we have over 4Gi in the
769        // offsets range
770        let items = BooleanArray::new_null(1024 * 1024);
771        let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 1024 * 1024]));
772        let list_arr = Arc::new(ListArray::new(
773            Arc::new(Field::new("item", DataType::Boolean, true)),
774            offsets,
775            Arc::new(items),
776            None,
777        )) as ArrayRef;
778        let arrs = vec![list_arr; 5000];
779
780        // We can't validate because our validation relies on concatenating all input arrays
781        let test_cases = TestCases::default().without_validation();
782        check_round_trip_encoding_of_data(arrs, &test_cases, HashMap::new()).await;
783    }
784}