use crate::utils::make_scalar_function;
use arrow::array::{
Array, ArrayRef, Capacities, GenericListArray, Int64Array, MutableArrayData,
NullBufferBuilder, OffsetSizeTrait, new_null_array,
};
use arrow::buffer::OffsetBuffer;
use arrow::datatypes::DataType;
use arrow::datatypes::{ArrowNativeType, Field};
use arrow::datatypes::{
DataType::{LargeList, List},
FieldRef,
};
use datafusion_common::cast::{as_int64_array, as_large_list_array, as_list_array};
use datafusion_common::utils::ListCoercion;
use datafusion_common::{Result, ScalarValue, exec_err, internal_datafusion_err};
use datafusion_expr::{
ArrayFunctionArgument, ArrayFunctionSignature, ColumnarValue, Documentation,
ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, Volatility,
};
use datafusion_macros::user_doc;
use std::sync::Arc;
// Wires `ArrayResize` into the crate's `make_udf_expr_and_func!` macro, which is
// defined outside this file. Presumably this generates the
// `array_resize(array, size, value)` expression helper and the `array_resize_udf`
// accessor — confirm against the macro definition.
make_udf_expr_and_func!(
ArrayResize,
array_resize,
array size value,
"returns an array with the specified size filled with the given value.",
array_resize_udf
);
// Scalar UDF type behind `array_resize`. The `user_doc` attribute below carries
// the user-facing description, syntax, SQL example and per-argument help text.
#[user_doc(
doc_section(label = "Array Functions"),
description = "Resizes the list to contain size elements. Initializes new elements with value or empty if value is not set.",
syntax_example = "array_resize(array, size, value)",
sql_example = r#"```sql
> select array_resize([1, 2, 3], 5, 0);
+-------------------------------------+
| array_resize(List([1,2,3],5,0)) |
+-------------------------------------+
| [1, 2, 3, 0, 0] |
+-------------------------------------+
```"#,
argument(
name = "array",
description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
),
argument(name = "size", description = "New size of given array."),
argument(
name = "value",
description = "Defines new elements' value or empty if value is not set."
)
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct ArrayResize {
// Accepted argument shapes; populated in `ArrayResize::new`.
signature: Signature,
// Alternative names for the function, exposed through `aliases()`.
aliases: Vec<String>,
}
impl Default for ArrayResize {
fn default() -> Self {
Self::new()
}
}
impl ArrayResize {
pub fn new() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
arguments: vec![
ArrayFunctionArgument::Array,
ArrayFunctionArgument::Index,
],
array_coercion: Some(ListCoercion::FixedSizedListToList),
}),
TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
arguments: vec![
ArrayFunctionArgument::Array,
ArrayFunctionArgument::Index,
ArrayFunctionArgument::Element,
],
array_coercion: Some(ListCoercion::FixedSizedListToList),
}),
],
Volatility::Immutable,
),
aliases: vec!["list_resize".to_string()],
}
}
}
impl ScalarUDFImpl for ArrayResize {
    /// Primary SQL name of the function.
    fn name(&self) -> &str {
        "array_resize"
    }

    /// Signature built in [`ArrayResize::new`].
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// The result has the same list type as the first argument. A `Null`
    /// first argument yields a `List` of nullable `Int64` items; any other
    /// type is rejected with an execution error.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        let list_type = &arg_types[0];
        match list_type {
            List(_) | LargeList(_) => Ok(list_type.clone()),
            DataType::Null => {
                let item = Field::new_list_field(DataType::Int64, true);
                Ok(List(Arc::new(item)))
            }
            _ => exec_err!(
                "Not reachable, data_type should be List, LargeList or FixedSizeList"
            ),
        }
    }

    /// Evaluates the function by handing the arguments to
    /// `array_resize_inner` through `make_scalar_function`.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        let resize = make_scalar_function(array_resize_inner);
        resize(&args.args)
    }

    /// Alternative names under which the function is available.
    fn aliases(&self) -> &[String] {
        self.aliases.as_slice()
    }

    /// Documentation generated by the `user_doc` attribute on the struct.
    fn documentation(&self) -> Option<&Documentation> {
        self.doc()
    }
}
/// Array-level kernel behind `array_resize`.
///
/// `arg` holds the list array, the `Int64` size array and, optionally, the
/// fill-value array. When every row of the list array is NULL the result is an
/// all-NULL array of the same list type; otherwise the work is delegated to
/// `general_list_resize` with the offset width matching the list type.
///
/// # Errors
/// Returns an execution error when the argument count is not two or three, or
/// when the first argument is neither `List` nor `LargeList`. Errors from the
/// size-array downcast and from the resize itself are propagated.
fn array_resize_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
    if !(2..=3).contains(&arg.len()) {
        return exec_err!("array_resize needs two or three arguments");
    }

    let list_arg = &arg[0];
    let list_type = list_arg.data_type();

    // Nothing to resize when every row is NULL: answer with NULLs of the
    // same list type.
    if list_arg.logical_null_count() == list_arg.len() {
        return match list_type {
            List(_) | LargeList(_) => Ok(new_null_array(list_type, list_arg.len())),
            _ => exec_err!("array_resize does not support type '{:?}'.", list_type),
        };
    }

    let sizes = as_int64_array(&arg[1])?;
    // Present only for the three-argument form.
    let fill_values = arg.get(2).map(Arc::clone);

    match list_type {
        List(field) => {
            let lists = as_list_array(list_arg)?;
            general_list_resize::<i32>(lists, sizes, field, fill_values)
        }
        LargeList(field) => {
            let lists = as_large_list_array(list_arg)?;
            general_list_resize::<i64>(lists, sizes, field, fill_values)
        }
        array_type => exec_err!("array_resize does not support type '{array_type}'."),
    }
}
/// Resizes every list in `array` to the length given by the matching entry of
/// `count_array`.
///
/// Lists longer than the requested size are truncated. Shorter lists keep
/// their elements and are padded at the end, either with the fill value from
/// `default_element` or with NULLs when `default_element` is `None`. NULL
/// input rows stay NULL.
///
/// # Arguments
/// * `array` - list array to resize
/// * `count_array` - requested length for each row
/// * `field` - item field of the resulting list array
/// * `default_element` - optional fill values, one per row
///
/// # Errors
/// Returns an error when a requested size cannot be converted to `usize`
/// (for example a negative size), or when the total number of output values
/// overflows `usize` or cannot be represented by the offset type `O`.
fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
    array: &GenericListArray<O>,
    count_array: &Int64Array,
    field: &FieldRef,
    default_element: Option<ArrayRef>,
) -> Result<ArrayRef> {
    let data_type = array.value_type();
    let values = array.values();
    let original_data = values.to_data();

    // Pass 1: measure the output. `output_values_len` is the total number of
    // child values in the result, `max_extra` the largest number of fill
    // values any single row needs.
    let mut max_extra: usize = 0;
    let mut output_values_len: usize = 0;
    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
        if array.is_null(row_index) {
            continue;
        }
        // NOTE(review): the validity of `count_array` is not consulted, so a
        // NULL size is read as whatever value its slot holds — confirm the
        // intended semantics for NULL sizes.
        let target_count = count_array.value(row_index).to_usize().ok_or_else(|| {
            internal_datafusion_err!("array_resize: failed to convert size to usize")
        })?;
        output_values_len =
            output_values_len.checked_add(target_count).ok_or_else(|| {
                internal_datafusion_err!("array_resize: output size overflow")
            })?;
        let current_len = (offset_window[1] - offset_window[0]).to_usize().unwrap();
        max_extra = max_extra.max(target_count.saturating_sub(current_len));
    }

    // The result stores its offsets as `O`. Reject totals that do not fit
    // before any buffer is allocated; otherwise the later conversion to `O`
    // would silently wrap (e.g. for `List` with 32-bit offsets).
    if O::from_usize(output_values_len).is_none() {
        return Err(internal_datafusion_err!(
            "array_resize: output size overflow"
        ));
    }

    // A single shared run of fill values can be used when some row grows and
    // the fill value is the same for every row: no explicit default (NULL
    // fill), at most one fill row, all fill rows NULL, or all fill rows
    // non-NULL and equal to the first one.
    let use_bulk_fill = max_extra > 0
        && match &default_element {
            None => true,
            Some(fill_array) => {
                let len = fill_array.len();
                let null_count = fill_array.logical_null_count();
                len <= 1
                    || null_count == len
                    || (null_count == 0 && {
                        let first = fill_array.slice(0, 1);
                        (1..len)
                            .all(|i| fill_array.slice(i, 1).as_ref() == first.as_ref())
                    })
            }
        };

    if use_bulk_fill {
        let fill_scalar = match &default_element {
            None => ScalarValue::try_from(&data_type)?,
            Some(fill_array) if fill_array.logical_null_count() == fill_array.len() => {
                ScalarValue::try_from(&data_type)?
            }
            Some(fill_array) => ScalarValue::try_from_array(fill_array.as_ref(), 0)?,
        };
        // `max_extra` copies are enough for any row; each row takes a prefix.
        let fill_values = fill_scalar.to_array_of_size(max_extra)?;
        let default_value_data = fill_values.to_data();
        build_resized_list(
            array,
            count_array,
            field,
            &original_data,
            &default_value_data,
            output_values_len,
            |mutable, _, extra_count| mutable.extend(1, 0, extra_count),
        )
    } else {
        let fill_values = match default_element {
            Some(fill_values) => fill_values,
            None => {
                // Without a default the bulk path is taken whenever a row
                // grows, so reaching this arm means no fill value is ever
                // read. An empty array of the right type is sufficient.
                let null_scalar = ScalarValue::try_from(&data_type)?;
                null_scalar.to_array_of_size(0)?
            }
        };
        let default_value_data = fill_values.to_data();
        build_resized_list(
            array,
            count_array,
            field,
            &original_data,
            &default_value_data,
            output_values_len,
            // Fill values differ per row: repeat this row's own fill value.
            |mutable, row_index, extra_count| {
                for _ in 0..extra_count {
                    mutable.extend(1, row_index, row_index + 1);
                }
            },
        )
    }
}
/// Assembles the resized list array row by row.
///
/// For every non-NULL row the leading elements of the original list are
/// copied from `original_data` (source 0). When the requested size exceeds
/// the current length, `append_fill_values` is called with the mutable
/// builder, the row index and the number of missing elements, and is expected
/// to append that many values from `default_value_data` (source 1). NULL rows
/// contribute no values and are marked NULL in the result.
///
/// # Arguments
/// * `array` - input list array; its offsets and validity drive the loop
/// * `count_array` - requested length for each row
/// * `field` - item field of the resulting list array
/// * `original_data` - child values of `array`
/// * `default_value_data` - values available for padding
/// * `output_values_len` - capacity reserved for the output child values
/// * `append_fill_values` - callback that appends the padding for one row
///
/// # Errors
/// Returns an error when a requested size cannot be converted to `usize`,
/// when an output offset cannot be represented by the offset type `O`, or
/// when the final list array fails validation.
fn build_resized_list<O, F>(
    array: &GenericListArray<O>,
    count_array: &Int64Array,
    field: &FieldRef,
    original_data: &arrow::array::ArrayData,
    default_value_data: &arrow::array::ArrayData,
    output_values_len: usize,
    mut append_fill_values: F,
) -> Result<ArrayRef>
where
    O: OffsetSizeTrait + TryInto<i64>,
    F: FnMut(&mut MutableArrayData, usize, usize),
{
    let capacity = Capacities::Array(output_values_len);
    let mut offsets = vec![O::usize_as(0)];
    let mut mutable = MutableArrayData::with_capacities(
        vec![original_data, default_value_data],
        false,
        capacity,
    );
    let mut null_builder = NullBufferBuilder::new(array.len());

    for (row_index, offset_window) in array.offsets().windows(2).enumerate() {
        if array.is_null(row_index) {
            // A NULL row repeats the previous offset and adds no values.
            null_builder.append_null();
            offsets.push(offsets[row_index]);
            continue;
        }
        null_builder.append_non_null();

        // NOTE(review): the validity of `count_array` is not consulted, so a
        // NULL size is read as whatever value its slot holds — confirm the
        // intended semantics for NULL sizes.
        let count = count_array.value(row_index).to_usize().ok_or_else(|| {
            internal_datafusion_err!("array_resize: failed to convert size to usize")
        })?;

        // Do the length arithmetic in `usize` and convert back with a checked
        // conversion, so a size that does not fit in `O` is reported instead
        // of being truncated or overflowing the offset type.
        let next_offset = offsets[row_index]
            .as_usize()
            .checked_add(count)
            .and_then(O::from_usize)
            .ok_or_else(|| {
                internal_datafusion_err!("array_resize: output size overflow")
            })?;

        let start = offset_window[0].as_usize();
        let end = offset_window[1].as_usize();
        let current_len = end - start;
        if count > current_len {
            // Growing: keep the whole list, then pad the remainder.
            mutable.extend(0, start, end);
            append_fill_values(&mut mutable, row_index, count - current_len);
        } else {
            // Shrinking or unchanged: keep only the first `count` elements.
            mutable.extend(0, start, start + count);
        }
        offsets.push(next_offset);
    }

    let data = mutable.freeze();
    Ok(Arc::new(GenericListArray::<O>::try_new(
        Arc::clone(field),
        OffsetBuffer::<O>::new(offsets.into()),
        arrow::array::make_array(data),
        null_builder.finish(),
    )?))
}