use arrow::array::{ArrayRef, StringArray};
use arrow::datatypes::{DataType, Field};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
use datafusion_functions_nested::string::StringToArray;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::hint::black_box;
use std::sync::Arc;
/// Number of rows in every generated input array; also passed to the UDF as
/// `number_rows` on each invocation.
const NUM_ROWS: usize = 1000;
/// Fixed seed for the `StdRng` used by the input generators, so each generator
/// produces the same data on every run.
const SEED: u64 = 42;
fn criterion_benchmark(c: &mut Criterion) {
let comma = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string())));
bench_string_to_array(
c,
"string_to_array_single_char_delim",
create_csv_strings,
&comma,
None,
);
let double_colon = ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string())));
bench_string_to_array(
c,
"string_to_array_multi_char_delim",
create_multi_delim_strings,
&double_colon,
None,
);
let null_str = ColumnarValue::Scalar(ScalarValue::Utf8(Some("NULL".to_string())));
bench_string_to_array(
c,
"string_to_array_with_null_str",
create_csv_strings_with_nulls,
&comma,
Some(&null_str),
);
let null_delim = ColumnarValue::Scalar(ScalarValue::Utf8(None));
bench_string_to_array(
c,
"string_to_array_null_delim",
create_short_strings,
&null_delim,
None,
);
bench_string_to_array_columnar_delim(c);
}
/// Benchmarks `string_to_array` when the delimiter is passed as an array
/// (one `","` per row) instead of a scalar.
///
/// Registers one benchmark in the `string_to_array_columnar_delim` group for
/// each of 5, 20 and 100, using that value as the argument to
/// [`create_csv_strings`].
///
/// # Panics
///
/// Panics if the UDF invocation returns an error.
fn bench_string_to_array_columnar_delim(c: &mut Criterion) {
    let mut group = c.benchmark_group("string_to_array_columnar_delim");

    // Loop-invariant setup, shared by reference count. Building these inside
    // `b.iter` would add the cost of constructing default options and
    // deep-cloning the field to every measured iteration.
    let config_options = Arc::new(ConfigOptions::default());
    let return_field = Arc::new(Field::new(
        "result",
        DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
        true,
    ));

    for &num_elements in &[5, 20, 100] {
        let delimiter_array: ArrayRef =
            Arc::new(StringArray::from(vec![Some(","); NUM_ROWS]));
        let args = vec![
            ColumnarValue::Array(create_csv_strings(num_elements)),
            ColumnarValue::Array(delimiter_array),
        ];
        let arg_fields = vec![
            Field::new("str", DataType::Utf8, true).into(),
            Field::new("delimiter", DataType::Utf8, false).into(),
        ];

        group.bench_with_input(
            BenchmarkId::from_parameter(num_elements),
            &num_elements,
            |b, _| {
                let udf = StringToArray::new();
                b.iter(|| {
                    black_box(
                        udf.invoke_with_args(ScalarFunctionArgs {
                            // `invoke_with_args` consumes its arguments, so the
                            // argument vectors are cloned per call.
                            args: args.clone(),
                            arg_fields: arg_fields.clone(),
                            number_rows: NUM_ROWS,
                            return_field: Arc::clone(&return_field),
                            config_options: Arc::clone(&config_options),
                        })
                        .unwrap(),
                    )
                })
            },
        );
    }
    group.finish();
}
/// Benchmarks `string_to_array` over a generated string column.
///
/// Registers one benchmark in the group `group_name` for each of 5, 20 and
/// 100; that value is passed to `make_strings` to build the input array.
///
/// * `make_strings` - generator for the first (string) argument.
/// * `delimiter` - second argument, cloned into every argument list.
/// * `null_str` - optional third argument; when present it is appended to the
///   arguments together with a matching `null_str` field.
///
/// # Panics
///
/// Panics if the UDF invocation returns an error.
fn bench_string_to_array(
    c: &mut Criterion,
    group_name: &str,
    make_strings: fn(usize) -> ArrayRef,
    delimiter: &ColumnarValue,
    null_str: Option<&ColumnarValue>,
) {
    let mut group = c.benchmark_group(group_name);

    // Loop-invariant setup, shared by reference count. Building these inside
    // `b.iter` would add the cost of constructing default options and
    // deep-cloning the field to every measured iteration.
    let config_options = Arc::new(ConfigOptions::default());
    let return_field = Arc::new(Field::new(
        "result",
        DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
        true,
    ));

    for &num_elements in &[5, 20, 100] {
        let mut args = vec![
            ColumnarValue::Array(make_strings(num_elements)),
            delimiter.clone(),
        ];
        let mut arg_fields = vec![
            Field::new("str", DataType::Utf8, true).into(),
            Field::new("delimiter", DataType::Utf8, true).into(),
        ];
        if let Some(ns) = null_str {
            args.push(ns.clone());
            arg_fields.push(Field::new("null_str", DataType::Utf8, true).into());
        }

        group.bench_with_input(
            BenchmarkId::from_parameter(num_elements),
            &num_elements,
            |b, _| {
                let udf = StringToArray::new();
                b.iter(|| {
                    black_box(
                        udf.invoke_with_args(ScalarFunctionArgs {
                            // `invoke_with_args` consumes its arguments, so the
                            // argument vectors are cloned per call.
                            args: args.clone(),
                            arg_fields: arg_fields.clone(),
                            number_rows: NUM_ROWS,
                            return_field: Arc::clone(&return_field),
                            config_options: Arc::clone(&config_options),
                        })
                        .unwrap(),
                    )
                })
            },
        );
    }
    group.finish();
}
/// Builds `NUM_ROWS` non-null strings, each consisting of `num_elements`
/// tokens of the form `val<N>` (with `N` drawn from `0..1000`) joined by `,`.
///
/// The generator is seeded with `SEED`, so the same `num_elements` always
/// yields the same array.
fn create_csv_strings(num_elements: usize) -> ArrayRef {
    let mut rng = StdRng::seed_from_u64(SEED);
    let mut rows: Vec<Option<String>> = Vec::with_capacity(NUM_ROWS);

    for _ in 0..NUM_ROWS {
        let mut tokens: Vec<String> = Vec::with_capacity(num_elements);
        for _ in 0..num_elements {
            tokens.push(format!("val{}", rng.random_range(0..1000)));
        }
        rows.push(Some(tokens.join(",")));
    }

    let array: StringArray = rows.into_iter().collect();
    Arc::new(array)
}
/// Builds `NUM_ROWS` non-null strings, each consisting of `num_elements`
/// tokens of the form `val<N>` (with `N` drawn from `0..1000`) separated by
/// the two-character delimiter `::`.
///
/// The generator is seeded with `SEED`, so the same `num_elements` always
/// yields the same array.
fn create_multi_delim_strings(num_elements: usize) -> ArrayRef {
    let mut rng = StdRng::seed_from_u64(SEED);
    let mut rows: Vec<Option<String>> = Vec::with_capacity(NUM_ROWS);

    for _ in 0..NUM_ROWS {
        let mut line = String::new();
        for position in 0..num_elements {
            // The separator goes between tokens only, never before the first.
            if position != 0 {
                line.push_str("::");
            }
            line.push_str(&format!("val{}", rng.random_range(0..1000)));
        }
        rows.push(Some(line));
    }

    let array: StringArray = rows.into_iter().collect();
    Arc::new(array)
}
/// Builds `NUM_ROWS` non-null comma-separated strings of `num_elements`
/// tokens, where each token is the literal text `NULL` when a random `f64`
/// draw is below `0.1`, and `val<N>` (with `N` drawn from `0..1000`) otherwise.
///
/// The generator is seeded with `SEED`, so the same `num_elements` always
/// yields the same array.
fn create_csv_strings_with_nulls(num_elements: usize) -> ArrayRef {
    let mut rng = StdRng::seed_from_u64(SEED);
    let mut rows: Vec<Option<String>> = Vec::with_capacity(NUM_ROWS);

    for _ in 0..NUM_ROWS {
        let mut tokens: Vec<String> = Vec::with_capacity(num_elements);
        for _ in 0..num_elements {
            // The probability draw always happens first; the value draw only
            // happens for tokens that are not the NULL marker.
            let use_null_marker = rng.random::<f64>() < 0.1;
            let token = if use_null_marker {
                "NULL".to_string()
            } else {
                format!("val{}", rng.random_range(0..1000))
            };
            tokens.push(token);
        }
        rows.push(Some(tokens.join(",")));
    }

    let array: StringArray = rows.into_iter().collect();
    Arc::new(array)
}
/// Builds `NUM_ROWS` non-null strings of `num_chars` random lowercase ASCII
/// letters (`a` through `z`), with no delimiter characters.
///
/// The generator is seeded with `SEED`, so the same `num_chars` always yields
/// the same array.
fn create_short_strings(num_chars: usize) -> ArrayRef {
    let mut rng = StdRng::seed_from_u64(SEED);
    let mut rows: Vec<Option<String>> = Vec::with_capacity(NUM_ROWS);

    for _ in 0..NUM_ROWS {
        let mut text = String::with_capacity(num_chars);
        for _ in 0..num_chars {
            let letter: u8 = rng.random_range(b'a'..=b'z');
            text.push(char::from(letter));
        }
        rows.push(Some(text));
    }

    let array: StringArray = rows.into_iter().collect();
    Arc::new(array)
}
// Declare the `benches` group containing `criterion_benchmark` and hand it to
// Criterion's generated benchmark entry point.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);