use std::{ops::Range, sync::Arc};
use arrow_array::{Array, ArrayRef, LargeListArray, ListArray, cast::AsArray, make_array};
use arrow_schema::DataType;
use futures::future::BoxFuture;
use lance_arrow::deepcopy::deep_copy_nulls;
use lance_arrow::list::ListArrayExt;
use lance_core::Result;
use crate::{
decoder::{
DecodedArray, FilterExpression, ScheduledScanLine, SchedulerContext,
StructuralDecodeArrayTask, StructuralFieldDecoder, StructuralFieldScheduler,
StructuralSchedulingJob,
},
encoder::{EncodeTask, FieldEncoder, OutOfLineBuffers},
repdef::RepDefBuilder,
};
/// Encodes list arrays (`List`/`LargeList`) by stripping offsets and validity
/// into repetition/definition levels and delegating the flattened values to a
/// child encoder.
pub struct ListStructuralEncoder {
    // When true, null buffers from the input array are shared (cloned cheaply)
    // instead of deep-copied; the caller keeps the original array alive.
    keep_original_array: bool,
    // Encoder for the flattened list items.
    child: Box<dyn FieldEncoder>,
}
impl ListStructuralEncoder {
    /// Creates a list encoder that wraps `child` (the encoder for the items).
    ///
    /// When `keep_original_array` is true the encoder may reference buffers of
    /// the original array instead of deep-copying them.
    pub fn new(keep_original_array: bool, child: Box<dyn FieldEncoder>) -> Self {
        Self {
            keep_original_array,
            child,
        }
    }
}
impl FieldEncoder for ListStructuralEncoder {
    /// Records the list offsets/validity in `repdef` and forwards the trimmed
    /// values array to the child encoder.
    ///
    /// # Panics
    ///
    /// Panics if `array` is neither a `ListArray` nor a `LargeListArray`.
    fn maybe_encode(
        &mut self,
        array: ArrayRef,
        external_buffers: &mut OutOfLineBuffers,
        mut repdef: RepDefBuilder,
        row_number: u64,
        num_rows: u64,
    ) -> Result<Vec<EncodeTask>> {
        // The null-buffer handling is identical for both offset widths, so
        // compute it once up front instead of duplicating it per branch.
        // Sharing (`cloned`) is safe when the original array is kept alive;
        // otherwise deep-copy so the input array can be dropped.
        let nulls = if self.keep_original_array {
            array.nulls().cloned()
        } else {
            deep_copy_nulls(array.nulls())
        };
        let values = if let Some(list_arr) = array.as_list_opt::<i32>() {
            // `add_offsets` reports whether null lists still cover item values
            // ("garbage" values) that must be filtered out before trimming.
            let has_garbage_values = repdef.add_offsets(list_arr.offsets().clone(), nulls);
            if has_garbage_values {
                list_arr.filter_garbage_nulls().trimmed_values()
            } else {
                list_arr.trimmed_values()
            }
        } else if let Some(list_arr) = array.as_list_opt::<i64>() {
            let has_garbage_values = repdef.add_offsets(list_arr.offsets().clone(), nulls);
            if has_garbage_values {
                list_arr.filter_garbage_nulls().trimmed_values()
            } else {
                list_arr.trimmed_values()
            }
        } else {
            panic!("List encoder used for non-list data")
        };
        self.child
            .maybe_encode(values, external_buffers, repdef, row_number, num_rows)
    }

    // The remaining methods delegate directly to the child (items) encoder;
    // the list layer adds no columns of its own.

    fn flush(&mut self, external_buffers: &mut OutOfLineBuffers) -> Result<Vec<EncodeTask>> {
        self.child.flush(external_buffers)
    }

    fn num_columns(&self) -> u32 {
        self.child.num_columns()
    }

    fn finish(
        &mut self,
        external_buffers: &mut OutOfLineBuffers,
    ) -> BoxFuture<'_, Result<Vec<crate::encoder::EncodedColumn>>> {
        self.child.finish(external_buffers)
    }
}
/// Scheduler for list fields.
///
/// Scheduling is fully delegated to the child (items) scheduler; the list
/// structure is reconstructed from repetition/definition levels at decode time.
#[derive(Debug)]
pub struct StructuralListScheduler {
    child: Box<dyn StructuralFieldScheduler>,
}
impl StructuralListScheduler {
    /// Creates a list scheduler that delegates all work to `child`.
    pub fn new(child: Box<dyn StructuralFieldScheduler>) -> Self {
        Self { child }
    }
}
impl StructuralFieldScheduler for StructuralListScheduler {
    /// Schedules the requested row ranges via the child scheduler and wraps the
    /// resulting job in a list-level job.
    fn schedule_ranges<'a>(
        &'a self,
        ranges: &[Range<u64>],
        filter: &FilterExpression,
    ) -> Result<Box<dyn StructuralSchedulingJob + 'a>> {
        let inner_job = self.child.schedule_ranges(ranges, filter)?;
        let wrapped: Box<dyn StructuralSchedulingJob + 'a> =
            Box::new(StructuralListSchedulingJob::new(inner_job));
        Ok(wrapped)
    }

    /// Initialization is handled entirely by the child scheduler.
    fn initialize<'a>(
        &'a mut self,
        filter: &'a FilterExpression,
        context: &'a SchedulerContext,
    ) -> BoxFuture<'a, Result<()>> {
        self.child.initialize(filter, context)
    }
}
/// Scheduling job for list fields; a thin wrapper that forwards every step to
/// the child (items) job.
#[derive(Debug)]
struct StructuralListSchedulingJob<'a> {
    child: Box<dyn StructuralSchedulingJob + 'a>,
}
impl<'a> StructuralListSchedulingJob<'a> {
    fn new(child: Box<dyn StructuralSchedulingJob + 'a>) -> Self {
        Self { child }
    }
}
impl StructuralSchedulingJob for StructuralListSchedulingJob<'_> {
    /// Pure delegation: the list layer adds no scheduling work of its own.
    fn schedule_next(&mut self, context: &mut SchedulerContext) -> Result<Vec<ScheduledScanLine>> {
        self.child.schedule_next(context)
    }
}
/// Decoder for list fields.
///
/// Pages are decoded by the child (items) decoder; `data_type` records the
/// target `List`/`LargeList` type so drained values can be re-assembled into a
/// list array.
#[derive(Debug)]
pub struct StructuralListDecoder {
    child: Box<dyn StructuralFieldDecoder>,
    data_type: DataType,
}
impl StructuralListDecoder {
    /// Creates a list decoder wrapping `child`; `data_type` must be the list
    /// type this decoder reconstructs.
    pub fn new(child: Box<dyn StructuralFieldDecoder>, data_type: DataType) -> Self {
        Self { child, data_type }
    }
}
impl StructuralFieldDecoder for StructuralListDecoder {
    /// Forwards a loaded page to the child decoder.
    fn accept_page(&mut self, child: crate::decoder::LoadedPageShard) -> Result<()> {
        self.child.accept_page(child)
    }

    /// Drains `num_rows` rows from the child and wraps the resulting task so
    /// decoded values are re-assembled into a list array.
    fn drain(&mut self, num_rows: u64) -> Result<Box<dyn StructuralDecodeArrayTask>> {
        let inner_task = self.child.drain(num_rows)?;
        let list_task = StructuralListDecodeTask::new(inner_task, self.data_type.clone());
        Ok(Box::new(list_task))
    }

    fn data_type(&self) -> &DataType {
        &self.data_type
    }
}
/// Decode task that re-assembles a child task's decoded values into a
/// `List`/`LargeList` array using offsets unraveled from the repdef levels.
#[derive(Debug)]
struct StructuralListDecodeTask {
    child_task: Box<dyn StructuralDecodeArrayTask>,
    // The list type to produce (must be `DataType::List` or `DataType::LargeList`).
    data_type: DataType,
}
impl StructuralListDecodeTask {
    fn new(child_task: Box<dyn StructuralDecodeArrayTask>, data_type: DataType) -> Self {
        Self {
            child_task,
            data_type,
        }
    }
}
impl StructuralDecodeArrayTask for StructuralListDecodeTask {
    /// Decodes the child values, then unravels offsets/validity from the repdef
    /// levels to rebuild the list array.
    ///
    /// # Panics
    ///
    /// Panics if `data_type` is not `List` or `LargeList`.
    fn decode(self: Box<Self>) -> Result<DecodedArray> {
        let DecodedArray { array, mut repdef } = self.child_task.decode()?;
        match &self.data_type {
            DataType::List(child_field) => {
                let (offsets, validity) = repdef.unravel_offsets::<i32>()?;
                // When every item slot is null but the child field is declared
                // non-nullable (presumably items belonging only to null/empty
                // lists), drop the spurious null buffer so that `try_new`'s
                // nullability check does not reject the array.
                let array = if !child_field.is_nullable() && array.null_count() == array.len() {
                    make_array(array.into_data().into_builder().nulls(None).build()?)
                } else {
                    array
                };
                let list_array = ListArray::try_new(child_field.clone(), offsets, array, validity)?;
                Ok(DecodedArray {
                    array: Arc::new(list_array),
                    repdef,
                })
            }
            DataType::LargeList(child_field) => {
                let (offsets, validity) = repdef.unravel_offsets::<i64>()?;
                // Consistency fix: apply the same all-null/non-nullable-child
                // workaround as the `List` branch above; previously a LargeList
                // in this state would fail `LargeListArray::try_new`'s
                // nullability check.
                let array = if !child_field.is_nullable() && array.null_count() == array.len() {
                    make_array(array.into_data().into_builder().nulls(None).build()?)
                } else {
                    array
                };
                let list_array =
                    LargeListArray::try_new(child_field.clone(), offsets, array, validity)?;
                Ok(DecodedArray {
                    array: Arc::new(list_array),
                    repdef,
                })
            }
            _ => panic!("List decoder did not have a list field"),
        }
    }
}
#[cfg(test)]
mod tests {
use std::{collections::HashMap, sync::Arc};
use crate::constants::{
STRUCTURAL_ENCODING_FULLZIP, STRUCTURAL_ENCODING_META_KEY, STRUCTURAL_ENCODING_MINIBLOCK,
};
use arrow_array::{
Array, ArrayRef, BooleanArray, DictionaryArray, LargeStringArray, ListArray, StructArray,
UInt8Array, UInt64Array,
builder::{Int32Builder, Int64Builder, LargeListBuilder, ListBuilder, StringBuilder},
};
use arrow_buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow_schema::{DataType, Field, Fields};
use rstest::rstest;
use crate::{
testing::{TestCases, check_basic_random, check_round_trip_encoding_of_data},
version::LanceFileVersion,
};
/// Helper: builds `List<inner_type>` with a nullable "item" child field.
fn make_list_type(inner_type: DataType) -> DataType {
    DataType::List(Arc::new(Field::new("item", inner_type, true)))
}
/// Helper: builds `LargeList<inner_type>` with a nullable "item" child field.
fn make_large_list_type(inner_type: DataType) -> DataType {
    DataType::LargeList(Arc::new(Field::new("item", inner_type, true)))
}
// Round-trips randomly generated List<Int32> data in both structural encodings.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let field =
        Field::new("", make_list_type(DataType::Int32), true).with_metadata(field_metadata);
    check_basic_random(field).await;
}
// Round-trips progressively deeper nested lists (List<Int32>,
// List<List<Int32>>, ... up to five levels) in both structural encodings.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_deeply_nested_lists(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    // Fix: the original shadowed `field` inside the loop, so every iteration
    // tested the same single-level List<Int32>, and `make_list_type` dropped
    // the structural-encoding metadata entirely.  Accumulate the nesting and
    // re-attach the metadata to the outer field at each level (matching how
    // `test_list` attaches it).
    let mut field = Field::new("item", DataType::Int32, true);
    for _ in 0..5 {
        field = Field::new("", make_list_type(field.data_type().clone()), true)
            .with_metadata(field_metadata.clone());
        check_basic_random(field.clone()).await;
    }
}
// Round-trips randomly generated LargeList<Int32> data.
#[test_log::test(tokio::test)]
async fn test_large_list() {
    let field = Field::new("", make_large_list_type(DataType::Int32), true);
    check_basic_random(field).await;
}
// Round-trips List<Utf8> (variable-width items inside lists).
#[test_log::test(tokio::test)]
async fn test_nested_strings() {
    let field = Field::new("", make_list_type(DataType::Utf8), true);
    check_basic_random(field).await;
}
// Round-trips List<List<Int32>>.
#[test_log::test(tokio::test)]
async fn test_nested_list() {
    let field = Field::new("", make_list_type(make_list_type(DataType::Int32)), true);
    check_basic_random(field).await;
}
// Round-trips List<Struct<inner_str: Utf8>>.
#[test_log::test(tokio::test)]
async fn test_list_struct_list() {
    let struct_type = DataType::Struct(Fields::from(vec![Field::new(
        "inner_str",
        DataType::Utf8,
        false,
    )]));
    let field = Field::new("", make_list_type(struct_type), true);
    check_basic_random(field).await;
}
// Over two million empty lists of structs: exercises a large offsets buffer
// that covers zero total items.
#[test_log::test(tokio::test)]
async fn test_list_struct_empty() {
    let fields = Fields::from(vec![Field::new("inner", DataType::UInt64, true)]);
    let items = UInt64Array::from(Vec::<u64>::new());
    let structs = StructArray::new(fields, vec![Arc::new(items)], None);
    // 2Mi + 1 offsets, all zero => 2Mi empty lists.
    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0; 2 * 1024 * 1024 + 1]));
    let lists = ListArray::new(
        Arc::new(Field::new("item", structs.data_type().clone(), true)),
        offsets,
        Arc::new(structs),
        None,
    );
    check_round_trip_encoding_of_data(
        vec![Arc::new(lists)],
        &TestCases::default(),
        HashMap::new(),
    )
    .await;
}
// Hand-built 4-row list (including one null list) verified against several
// range and take (indices) access patterns.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items_builder = Int32Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_value([Some(1), Some(2), Some(3)]);
    list_builder.append_value([Some(4), Some(5)]);
    list_builder.append_null();
    list_builder.append_value([Some(6), Some(7), Some(8)]);
    let list_array = list_builder.finish();
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(0..3)
        .with_range(1..3)
        .with_indices(vec![1, 3])
        .with_indices(vec![2]);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
        .await;
}
// Nested list where both the inner and the outer list end with a null entry;
// covers trailing-null handling in the repdef unraveling.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_nested_list_ends_with_null(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    use arrow_array::Int32Array;
    // Inner: six lists over five values; the last list is empty and null.
    let values = Int32Array::from(vec![1, 2, 3, 4, 5]);
    let inner_offsets = ScalarBuffer::<i32>::from(vec![0, 1, 2, 3, 4, 5, 5]);
    let inner_validity = BooleanBuffer::from(vec![true, true, true, true, true, false]);
    // Outer: seven lists; the last list is empty and null.
    let outer_offsets = ScalarBuffer::<i32>::from(vec![0, 1, 2, 3, 4, 5, 6, 6]);
    let outer_validity = BooleanBuffer::from(vec![true, true, true, true, true, true, false]);
    let inner_list = ListArray::new(
        Arc::new(Field::new("item", DataType::Int32, true)),
        OffsetBuffer::new(inner_offsets),
        Arc::new(values),
        Some(NullBuffer::new(inner_validity)),
    );
    let outer_list = ListArray::new(
        Arc::new(Field::new(
            "item",
            DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
            true,
        )),
        OffsetBuffer::new(outer_offsets),
        Arc::new(inner_list),
        Some(NullBuffer::new(outer_validity)),
    );
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(0..3)
        .with_range(5..7)
        .with_indices(vec![1, 6])
        .with_indices(vec![6])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![Arc::new(outer_list)], &test_cases, field_metadata)
        .await;
}
// String lists with a null item and a null list.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_string_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items_builder = StringBuilder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_value([Some("a"), Some("bc"), Some("def")]);
    list_builder.append_value([Some("gh"), None]);
    list_builder.append_null();
    list_builder.append_value([Some("ijk"), Some("lmnop"), Some("qrs")]);
    let list_array = list_builder.finish();
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(0..3)
        .with_range(1..3)
        .with_indices(vec![1, 3])
        .with_indices(vec![2])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
        .await;
}
// Same shape as above but with no nulls anywhere.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_string_list_no_null(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items_builder = StringBuilder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_value([Some("a"), Some("bc"), Some("def")]);
    list_builder.append_value([Some("gh"), Some("zxy")]);
    list_builder.append_value([Some("gh"), Some("z")]);
    list_builder.append_value([Some("ijk"), Some("lmnop"), Some("qrs")]);
    let list_array = list_builder.finish();
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(0..3)
        .with_range(1..3)
        .with_indices(vec![1, 3])
        .with_indices(vec![2])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
        .await;
}
// A sliced list array (offset 1, length 2): exercises non-zero array offsets.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_sliced_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items_builder = Int32Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_value([Some(1), Some(2), Some(3)]);
    list_builder.append_value([Some(4), Some(5)]);
    list_builder.append_null();
    list_builder.append_value([Some(6), Some(7), Some(8)]);
    let list_array = list_builder.finish();
    // Keep rows 1..=2: one valid list and the null list.
    let list_array = list_array.slice(1, 2);
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(1..2)
        .with_indices(vec![0])
        .with_indices(vec![1])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
        .await;
}
// Lists of dictionary-encoded large strings.
#[test_log::test(tokio::test)]
async fn test_simple_list_dict() {
    let values = LargeStringArray::from_iter_values(["a", "bb", "ccc"]);
    let indices = UInt8Array::from(vec![0, 1, 2, 0, 1, 2, 0, 1, 2]);
    let dict_array = DictionaryArray::new(indices, Arc::new(values));
    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 9]));
    let list_array = ListArray::new(
        Arc::new(Field::new("item", dict_array.data_type().clone(), true)),
        offsets,
        Arc::new(dict_array),
        None,
    );
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(1..3)
        .with_range(2..4)
        .with_indices(vec![1])
        .with_indices(vec![2]);
    check_round_trip_encoding_of_data(
        vec![Arc::new(list_array)],
        &test_cases,
        HashMap::default(),
    )
    .await;
}
// Every list is null but the offsets still cover item values, and the items
// field is declared non-nullable.
#[test_log::test(tokio::test)]
async fn test_simple_list_all_null() {
    let items = UInt64Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    let offsets = ScalarBuffer::<i32>::from(vec![0, 5, 8, 10]);
    let offsets = OffsetBuffer::new(offsets);
    let list_validity = NullBuffer::new(BooleanBuffer::from(vec![false, false, false]));
    let list_arr = ListArray::new(
        Arc::new(Field::new("item", DataType::UInt64, false)),
        offsets,
        Arc::new(items),
        Some(list_validity),
    );
    let test_cases = TestCases::default()
        .with_range(0..3)
        .with_range(1..2)
        .with_indices(vec![1])
        .with_indices(vec![2])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(
        vec![Arc::new(list_arr)],
        &test_cases,
        HashMap::default(),
    )
    .await;
}
// A null list in the middle whose offsets still span item values ("garbage"
// values that the encoder must filter out).
#[rstest]
#[test_log::test(tokio::test)]
async fn test_list_with_garbage_nulls(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items = UInt64Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    let offsets = ScalarBuffer::<i32>::from(vec![0, 5, 8, 10]);
    let offsets = OffsetBuffer::new(offsets);
    let list_validity = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
    let list_arr = ListArray::new(
        Arc::new(Field::new("item", DataType::UInt64, true)),
        offsets,
        Arc::new(items),
        Some(list_validity),
    );
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..3)
        .with_range(1..2)
        .with_indices(vec![1])
        .with_indices(vec![2])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![Arc::new(list_arr)], &test_cases, field_metadata)
        .await;
}
// Two 512-row arrays with a tiny page size, so a single read range spans
// multiple pages.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_simple_two_page_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let items_builder = Int64Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    for i in 0..512 {
        list_builder.append_value([Some(i), Some(i * 2)]);
    }
    let list_array_1 = list_builder.finish();
    let items_builder = Int64Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    for i in 0..512 {
        let i = i + 512;
        list_builder.append_value([Some(i), Some(i * 2)]);
    }
    let list_array_2 = list_builder.finish();
    let mut metadata = HashMap::new();
    metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_min_file_version(LanceFileVersion::V2_1)
        .with_page_sizes(vec![100])
        .with_range(800..900);
    check_round_trip_encoding_of_data(
        vec![Arc::new(list_array_1), Arc::new(list_array_2)],
        &test_cases,
        metadata,
    )
    .await;
}
// Hand-built LargeList (i64 offsets) with a null entry.
#[test_log::test(tokio::test)]
async fn test_simple_large_list() {
    let items_builder = Int32Builder::new();
    let mut list_builder = LargeListBuilder::new(items_builder);
    list_builder.append_value([Some(1), Some(2), Some(3)]);
    list_builder.append_value([Some(4), Some(5)]);
    list_builder.append_null();
    list_builder.append_value([Some(6), Some(7), Some(8)]);
    let list_array = list_builder.finish();
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_range(0..3)
        .with_range(1..3)
        .with_indices(vec![1, 3]);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, HashMap::new())
        .await;
}
// A grab-bag of empty/null list scenarios, each checked with the default batch
// size and again with batch size 1 (one row per batch).
#[rstest]
#[test_log::test(tokio::test)]
async fn test_empty_lists(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    // Scenario 1: a non-empty list, an empty list, and a list holding a single
    // null item, tested in several orders so the empty list appears at
    // different positions.
    let values = [vec![Some(1), Some(2), Some(3)], vec![], vec![None]];
    for order in [[0, 1, 2], [1, 0, 2], [2, 0, 1]] {
        let items_builder = Int32Builder::new();
        let mut list_builder = ListBuilder::new(items_builder);
        for idx in order {
            list_builder.append_value(values[idx].clone());
        }
        let list_array = Arc::new(list_builder.finish());
        let test_cases = TestCases::default()
            .with_indices(vec![1])
            .with_indices(vec![0])
            .with_indices(vec![2])
            .with_indices(vec![0, 1]);
        check_round_trip_encoding_of_data(
            vec![list_array.clone()],
            &test_cases,
            field_metadata.clone(),
        )
        .await;
        let test_cases = test_cases.with_batch_size(1);
        check_round_trip_encoding_of_data(
            vec![list_array],
            &test_cases,
            field_metadata.clone(),
        )
        .await;
    }
    // Scenario 2: empty int lists with a null list in the middle (no items at all).
    let items_builder = Int32Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append(true);
    list_builder.append_null();
    list_builder.append(true);
    let list_array = Arc::new(list_builder.finish());
    let test_cases = TestCases::default().with_range(0..2).with_indices(vec![1]);
    check_round_trip_encoding_of_data(
        vec![list_array.clone()],
        &test_cases,
        field_metadata.clone(),
    )
    .await;
    let test_cases = test_cases.with_batch_size(1);
    check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
        .await;
    // Scenario 3: same shape with string items (variable-width child).
    let items_builder = StringBuilder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append(true);
    list_builder.append_null();
    list_builder.append(true);
    let list_array = Arc::new(list_builder.finish());
    let test_cases = TestCases::default().with_range(0..2).with_indices(vec![1]);
    check_round_trip_encoding_of_data(
        vec![list_array.clone()],
        &test_cases,
        field_metadata.clone(),
    )
    .await;
    let test_cases = test_cases.with_batch_size(1);
    check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
        .await;
    // Scenario 4: every list is null (and empty).
    let items_builder = Int32Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_null();
    list_builder.append_null();
    list_builder.append_null();
    let list_array = Arc::new(list_builder.finish());
    let test_cases = TestCases::default().with_range(0..2).with_indices(vec![1]);
    check_round_trip_encoding_of_data(
        vec![list_array.clone()],
        &test_cases,
        field_metadata.clone(),
    )
    .await;
    let test_cases = test_cases.with_batch_size(1);
    check_round_trip_encoding_of_data(vec![list_array], &test_cases, field_metadata.clone())
        .await;
    // Scenario 5: all-null lists nested inside a struct that itself has nulls.
    let items_builder = Int32Builder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    list_builder.append_null();
    list_builder.append_null();
    list_builder.append_null();
    let list_array = Arc::new(list_builder.finish());
    let struct_validity = NullBuffer::new(BooleanBuffer::from(vec![true, false, true]));
    let struct_array = Arc::new(StructArray::new(
        Fields::from(vec![Field::new(
            "lists",
            list_array.data_type().clone(),
            true,
        )]),
        vec![list_array],
        Some(struct_validity),
    ));
    let test_cases = TestCases::default()
        .with_range(0..2)
        .with_indices(vec![1])
        .with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(
        vec![struct_array.clone()],
        &test_cases,
        field_metadata.clone(),
    )
    .await;
    let test_cases = test_cases.with_batch_size(1);
    check_round_trip_encoding_of_data(vec![struct_array], &test_cases, field_metadata.clone())
        .await;
}
// Nested list where every outer entry is null (the inner list is never built).
#[test_log::test(tokio::test)]
async fn test_empty_list_list() {
    let items_builder = Int32Builder::new();
    let list_builder = ListBuilder::new(items_builder);
    let mut outer_list_builder = ListBuilder::new(list_builder);
    outer_list_builder.append_null();
    outer_list_builder.append_null();
    outer_list_builder.append_null();
    let list_array = Arc::new(outer_list_builder.finish());
    let test_cases = TestCases::default().with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![list_array], &test_cases, HashMap::new()).await;
}
// 5000 copies of a single list holding 1Mi null booleans; ignored by default
// because of its size, and validation is disabled for the same reason.
#[test_log::test(tokio::test)]
#[ignore]
async fn test_jumbo_list() {
    let items = BooleanArray::new_null(1024 * 1024);
    let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0, 1024 * 1024]));
    let list_arr = Arc::new(ListArray::new(
        Arc::new(Field::new("item", DataType::Boolean, true)),
        offsets,
        Arc::new(items),
        None,
    )) as ArrayRef;
    let arrs = vec![list_arr; 5000];
    let test_cases = TestCases::default().without_validation();
    check_round_trip_encoding_of_data(arrs, &test_cases, HashMap::new()).await;
}
// Regression test for fuzz issue 4466: 95 lists totaling 1025 values with
// irregular list sizes.
#[tokio::test]
async fn test_fuzz_issue_4466() {
    let list_sizes = vec![
        13, 18, 12, 7, 14, 12, 6, 13, 18, 8, 6, 11, 17, 12, 8, 19, 5, 6, 10, 13, 8, 6, 10, 4,
        8, 16, 14, 12, 18, 9, 17, 8, 14, 18, 15, 3, 2, 4, 5, 1, 3, 13, 1, 2, 10, 4, 10, 18, 7,
        14, 18, 13, 9, 17, 3, 13, 10, 14, 8, 19, 17, 10, 5, 11, 6, 15, 10, 18, 18, 20, 16, 11,
        12, 15, 7, 9, 3, 10, 20, 5, 2, 3, 17, 4, 8, 12, 15, 6, 3, 20, 15, 20, 1, 19, 16,
    ];
    let mut list_builder = ListBuilder::new(Int32Builder::new());
    let mut total_values = 0;
    for size in &list_sizes {
        for i in 0..*size {
            list_builder.values().append_value(i);
        }
        list_builder.append(true);
        total_values += size;
    }
    let list_array = Arc::new(list_builder.finish());
    assert_eq!(list_array.len(), 95);
    assert_eq!(total_values, 1025);
    let test_cases = TestCases::default().with_min_file_version(LanceFileVersion::V2_1);
    check_round_trip_encoding_of_data(vec![list_array], &test_cases, HashMap::new()).await;
}
// 2.5M rows where nearly every list is empty and only ~100 evenly spaced lists
// hold string values: exercises very sparse offsets over a large row count.
#[rstest]
#[test_log::test(tokio::test)]
async fn test_sparse_large_string_list(
    #[values(STRUCTURAL_ENCODING_MINIBLOCK, STRUCTURAL_ENCODING_FULLZIP)]
    structural_encoding: &str,
) {
    let num_rows = 2_500_000u32;
    let num_non_empty = 100u32;
    let strings_per_list = 10;
    let items_builder = StringBuilder::new();
    let mut list_builder = ListBuilder::new(items_builder);
    // Non-empty lists are spread evenly, starting half a step in.
    let step = num_rows / num_non_empty;
    let mut next_non_empty = step / 2;
    for i in 0..num_rows {
        if i == next_non_empty {
            let vals: Vec<Option<&str>> = (0..strings_per_list)
                .map(|j| match j % 4 {
                    0 => Some("a"),
                    1 => Some("bb"),
                    2 => Some("ccc"),
                    _ => Some("d"),
                })
                .collect();
            list_builder.append_value(vals);
            next_non_empty = next_non_empty.saturating_add(step);
        } else {
            list_builder.append_value([] as [Option<&str>; 0]);
        }
    }
    let list_array = list_builder.finish();
    let mut field_metadata = HashMap::new();
    field_metadata.insert(
        STRUCTURAL_ENCODING_META_KEY.to_string(),
        structural_encoding.into(),
    );
    let test_cases = TestCases::default()
        .with_range(0..1000)
        .with_range(0..num_rows as u64)
        .with_indices(vec![0, (step / 2) as u64, num_rows as u64 - 1])
        .with_max_file_version(LanceFileVersion::V2_2);
    check_round_trip_encoding_of_data(vec![Arc::new(list_array)], &test_cases, field_metadata)
        .await;
}
}