use crate::array::*;
use crate::datatypes::*;
use crate::util::test_util::seedable_rng;
use arrow_buffer::{Buffer, IntervalMonthDayNano};
use half::f16;
use rand::distr::uniform::SampleUniform;
use rand::rng;
use rand::Rng;
use rand::SeedableRng;
use rand::{
distr::{Alphanumeric, Distribution, StandardUniform},
prelude::StdRng,
};
use std::ops::Range;
/// Creates a [`PrimitiveArray`] with `size` random slots, using the RNG
/// returned by `seedable_rng()`.
///
/// For every slot an `f32` is sampled first; the slot is null when that sample
/// is below `null_density`, otherwise a `T::Native` value is sampled from
/// [`StandardUniform`].
pub fn create_primitive_array<T>(size: usize, null_density: f32) -> PrimitiveArray<T>
where
    T: ArrowPrimitiveType,
    StandardUniform: Distribution<T::Native>,
{
    let mut rng = seedable_rng();
    let next_slot = || -> Option<T::Native> {
        // The null decision is drawn before (and independently of) the value.
        let is_null = rng.random::<f32>() < null_density;
        if is_null {
            return None;
        }
        Some(rng.random::<T::Native>())
    };
    std::iter::repeat_with(next_slot).take(size).collect()
}
/// Creates a [`PrimitiveArray`] with `size` random slots from a [`StdRng`]
/// seeded with `seed`.
///
/// A slot is null when an `f32` sample is below `null_density`; otherwise it
/// holds a `T::Native` sampled from [`StandardUniform`].
pub fn create_primitive_array_with_seed<T>(
    size: usize,
    null_density: f32,
    seed: u64,
) -> PrimitiveArray<T>
where
    T: ArrowPrimitiveType,
    StandardUniform: Distribution<T::Native>,
{
    let mut rng = StdRng::seed_from_u64(seed);
    (0..size)
        .map(|_| {
            let is_null = rng.random::<f32>() < null_density;
            if is_null {
                None
            } else {
                // A value is only drawn for non-null slots.
                let value = rng.random::<T::Native>();
                Some(value)
            }
        })
        .collect()
}
/// Creates an [`IntervalMonthDayNanoArray`] with `size` random slots from a
/// [`StdRng`] seeded with `seed`.
///
/// A slot is null when an `f32` sample is below `null_density`; otherwise it
/// is built by `IntervalMonthDayNano::new` from three consecutively sampled
/// components.
pub fn create_month_day_nano_array_with_seed(
    size: usize,
    null_density: f32,
    seed: u64,
) -> IntervalMonthDayNanoArray {
    let mut rng = StdRng::seed_from_u64(seed);
    (0..size)
        .map(|_| {
            if rng.random::<f32>() < null_density {
                return None;
            }
            // Sampled in the same order as the constructor's arguments.
            let months = rng.random();
            let days = rng.random();
            let nanoseconds = rng.random();
            Some(IntervalMonthDayNano::new(months, days, nanoseconds))
        })
        .collect()
}
/// Creates a [`BooleanArray`] with `size` random slots, using the RNG returned
/// by `seedable_rng()`.
///
/// A slot is null when an `f32` sample is below `null_density`. For non-null
/// slots a second `f32` is sampled and the value is `true` when that sample is
/// below `true_density`.
pub fn create_boolean_array(size: usize, null_density: f32, true_density: f32) -> BooleanArray
where
    StandardUniform: Distribution<bool>,
{
    let mut rng = seedable_rng();
    (0..size)
        .map(|_| {
            let is_null = rng.random::<f32>() < null_density;
            if is_null {
                None
            } else {
                Some(rng.random::<f32>() < true_density)
            }
        })
        .collect()
}
/// Creates a random [`GenericStringArray`] of `size` slots with the given
/// `null_density`.
///
/// Delegates to `create_string_array_with_max_len` with an upper length bound
/// of 400.
pub fn create_string_array<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
) -> GenericStringArray<Offset> {
    // Upper bound handed to the length-limited generator.
    const MAX_STR_LEN: usize = 400;
    create_string_array_with_max_len::<Offset>(size, null_density, MAX_STR_LEN)
}
/// Creates a random [`GenericStringArray`] whose non-null values all start
/// with `"prefix_"`.
///
/// Delegates to `create_string_array_with_len_range_and_prefix` with a length
/// range of 13 to 100.
pub fn create_longer_string_array_with_same_prefix<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
) -> GenericStringArray<Offset> {
    const MIN_STR_LEN: usize = 13;
    const MAX_STR_LEN: usize = 100;
    const PREFIX: &str = "prefix_";
    create_string_array_with_len_range_and_prefix::<Offset>(
        size,
        null_density,
        MIN_STR_LEN,
        MAX_STR_LEN,
        PREFIX,
    )
}
/// Creates a random [`StringViewArray`] whose non-null values all start with
/// `"prefix_"`.
///
/// Delegates to `create_string_view_array_with_len_range_and_prefix` with a
/// length range of 13 to 100.
pub fn create_longer_string_view_array_with_same_prefix(
    size: usize,
    null_density: f32,
) -> StringViewArray {
    const MIN_STR_LEN: usize = 13;
    const MAX_STR_LEN: usize = 100;
    const PREFIX: &str = "prefix_";
    create_string_view_array_with_len_range_and_prefix(
        size,
        null_density,
        MIN_STR_LEN,
        MAX_STR_LEN,
        PREFIX,
    )
}
/// Creates a random [`GenericStringArray`] whose non-null values are `prefix`
/// followed by random alphanumeric characters.
///
/// A slot is null when an `f32` sample is below `null_density`. Otherwise the
/// number of characters appended after `prefix` is drawn uniformly from
/// `min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len())`.
///
/// # Panics
///
/// Panics if `min_str_len > max_str_len` or `prefix.len() > max_str_len`.
fn create_string_array_with_len_range_and_prefix<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
    min_str_len: usize,
    max_str_len: usize,
    prefix: &str,
) -> GenericStringArray<Offset> {
    assert!(
        min_str_len <= max_str_len,
        "min_str_len must be <= max_str_len"
    );
    assert!(
        prefix.len() <= max_str_len,
        "Prefix length must be <= max_str_len"
    );
    // The suffix bounds do not depend on the slot, so compute them once.
    let prefix_len = prefix.len();
    let min_suffix_len = min_str_len.saturating_sub(prefix_len);
    let max_suffix_len = max_str_len - prefix_len;
    let rng = &mut seedable_rng();
    (0..size)
        .map(|_| {
            if rng.random::<f32>() < null_density {
                return None;
            }
            let suffix_len = rng.random_range(min_suffix_len..=max_suffix_len);
            let mut value = String::with_capacity(prefix_len + suffix_len);
            value.push_str(prefix);
            value.extend(
                rng.sample_iter(&Alphanumeric)
                    .take(suffix_len)
                    .map(char::from),
            );
            Some(value)
        })
        .collect()
}
/// Creates a random [`StringViewArray`] whose non-null values are `prefix`
/// followed by random alphanumeric characters.
///
/// A slot is null when an `f32` sample is below `null_density`. Otherwise the
/// number of characters appended after `prefix` is drawn uniformly from
/// `min_str_len.saturating_sub(prefix.len())..=(max_str_len - prefix.len())`.
///
/// # Panics
///
/// Panics if `min_str_len > max_str_len` or `prefix.len() > max_str_len`.
fn create_string_view_array_with_len_range_and_prefix(
    size: usize,
    null_density: f32,
    min_str_len: usize,
    max_str_len: usize,
    prefix: &str,
) -> StringViewArray {
    assert!(
        min_str_len <= max_str_len,
        "min_str_len must be <= max_str_len"
    );
    assert!(
        prefix.len() <= max_str_len,
        "Prefix length must be <= max_str_len"
    );
    let shortest_suffix = min_str_len.saturating_sub(prefix.len());
    let longest_suffix = max_str_len - prefix.len();
    let rng = &mut seedable_rng();
    (0..size)
        .map(|_| {
            let is_null = rng.random::<f32>() < null_density;
            if is_null {
                None
            } else {
                let suffix_len = rng.random_range(shortest_suffix..=longest_suffix);
                let suffix: String = rng
                    .sample_iter(&Alphanumeric)
                    .take(suffix_len)
                    .map(char::from)
                    .collect();
                Some([prefix, suffix.as_str()].concat())
            }
        })
        .collect()
}
/// Creates a random [`GenericStringArray`] of alphanumeric strings whose
/// lengths are drawn from `0..max_str_len` (upper bound exclusive).
///
/// A slot is null when an `f32` sample is below `null_density`.
///
/// NOTE(review): `max_str_len` is presumably expected to be > 0, since an
/// empty range makes `random_range` panic — confirm against the `rand` docs.
fn create_string_array_with_max_len<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
    max_str_len: usize,
) -> GenericStringArray<Offset> {
    let rng = &mut seedable_rng();
    (0..size)
        .map(|_| {
            if rng.random::<f32>() < null_density {
                return None;
            }
            let str_len = rng.random_range(0..max_str_len);
            // `Alphanumeric` yields ASCII bytes, so a per-byte conversion
            // produces the same string as decoding the bytes as UTF-8.
            let value: String = rng
                .sample_iter(&Alphanumeric)
                .take(str_len)
                .map(char::from)
                .collect();
            Some(value)
        })
        .collect()
}
/// Creates a random [`GenericStringArray`] in which every non-null value is an
/// alphanumeric string of exactly `str_len` characters.
///
/// A slot is null when an `f32` sample is below `null_density`.
pub fn create_string_array_with_len<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
    str_len: usize,
) -> GenericStringArray<Offset> {
    let rng = &mut seedable_rng();
    (0..size)
        .map(|_| {
            let is_null = rng.random::<f32>() < null_density;
            if is_null {
                None
            } else {
                let bytes: Vec<u8> = rng.sample_iter(&Alphanumeric).take(str_len).collect();
                Some(String::from_utf8(bytes).unwrap())
            }
        })
        .collect()
}
/// Creates a random [`StringViewArray`] of `size` slots with the given
/// `null_density`.
///
/// Delegates to `create_string_view_array_with_max_len` with an upper length
/// bound of 400.
pub fn create_string_view_array(size: usize, null_density: f32) -> StringViewArray {
    // Upper bound handed to the length-limited generator.
    const MAX_STR_LEN: usize = 400;
    create_string_view_array_with_max_len(size, null_density, MAX_STR_LEN)
}
/// Creates a random [`StringViewArray`] of alphanumeric strings whose lengths
/// are drawn from `0..max_str_len` (upper bound exclusive).
///
/// A slot is null when an `f32` sample is below `null_density`.
///
/// NOTE(review): `max_str_len` is presumably expected to be > 0, since an
/// empty range makes `random_range` panic — confirm against the `rand` docs.
fn create_string_view_array_with_max_len(
    size: usize,
    null_density: f32,
    max_str_len: usize,
) -> StringViewArray {
    let rng = &mut seedable_rng();
    let next_entry = |_: usize| -> Option<String> {
        if rng.random::<f32>() < null_density {
            return None;
        }
        let str_len = rng.random_range(0..max_str_len);
        let bytes: Vec<u8> = rng.sample_iter(&Alphanumeric).take(str_len).collect();
        Some(String::from_utf8(bytes).unwrap())
    };
    (0..size).map(next_entry).collect()
}
/// Creates a random [`StringViewArray`] of alphanumeric strings.
///
/// * When `mixed` is `false`, every non-null value has exactly `str_len`
///   characters.
/// * When `mixed` is `true`, the first `size / 2` slots get lengths drawn from
///   `1..12` and the remaining slots get lengths drawn from
///   `12..=max(30, str_len)`.
///
/// All lengths are chosen before any value is generated. A slot is null when
/// an `f32` sample is below `null_density`.
pub fn create_string_view_array_with_len(
    size: usize,
    null_density: f32,
    str_len: usize,
    mixed: bool,
) -> StringViewArray {
    let rng = &mut seedable_rng();
    let lengths: Vec<usize> = if mixed {
        let short_count = size / 2;
        let long_upper = str_len.max(30);
        let mut picked: Vec<usize> = Vec::with_capacity(size);
        picked.extend((0..short_count).map(|_| rng.random_range(1usize..12)));
        picked.extend((short_count..size).map(|_| rng.random_range(12usize..=long_upper)));
        picked
    } else {
        vec![str_len; size]
    };
    lengths
        .into_iter()
        .map(|len| {
            let is_null = rng.random::<f32>() < null_density;
            if is_null {
                return None;
            }
            let bytes: Vec<u8> = rng.sample_iter(&Alphanumeric).take(len).collect();
            Some(String::from_utf8(bytes).unwrap())
        })
        .collect()
}
/// Creates a [`DictionaryArray`] from `size` random optional strings.
///
/// Each entry is null when an `f32` sample is below `null_density`; otherwise
/// it is an alphanumeric string of exactly `str_len` characters. The owned
/// strings are generated first and then collected, as `Option<&str>`, into the
/// dictionary array.
pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
    size: usize,
    null_density: f32,
    str_len: usize,
) -> DictionaryArray<K> {
    let rng = &mut seedable_rng();
    let mut entries: Vec<Option<String>> = Vec::with_capacity(size);
    for _ in 0..size {
        let entry = if rng.random::<f32>() < null_density {
            None
        } else {
            let bytes: Vec<u8> = rng.sample_iter(&Alphanumeric).take(str_len).collect();
            Some(String::from_utf8(bytes).unwrap())
        };
        entries.push(entry);
    }
    entries.iter().map(|entry| entry.as_deref()).collect()
}
/// Creates a [`RunArray`] with `physical_array_len` runs covering
/// `logical_array_len` logical values.
///
/// Run `i` holds the value `V::Native::from_usize(i)`. Every run has
/// `logical_array_len / physical_array_len` elements, and the first
/// `logical_array_len % physical_array_len` runs get one extra element, so the
/// run lengths sum to exactly `logical_array_len`.
///
/// # Panics
///
/// Panics if `logical_array_len < physical_array_len`, if
/// `physical_array_len` is zero, or if a run index cannot be converted by
/// `V::Native::from_usize`.
pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
    logical_array_len: usize,
    physical_array_len: usize,
) -> RunArray<R> {
    assert!(logical_array_len >= physical_array_len);
    // Fail with a clear message instead of an opaque divide-by-zero below.
    assert!(
        physical_array_len > 0,
        "physical_array_len must be greater than zero"
    );
    let run_len = logical_array_len / physical_array_len;
    let longer_runs = logical_array_len % physical_array_len;
    // The run lengths add up to `logical_array_len`, so no padding is needed
    // and the values can be streamed straight into the builder.
    let values = (0..physical_array_len).flat_map(|run_idx| {
        let take_len = if run_idx < longer_runs {
            run_len + 1
        } else {
            run_len
        };
        std::iter::repeat(V::Native::from_usize(run_idx).unwrap()).take(take_len)
    });
    let mut builder = PrimitiveRunBuilder::<R, V>::with_capacity(physical_array_len);
    builder.extend(values.map(Some));
    builder.finish()
}
/// Creates `logical_array_len` strings arranged as `physical_array_len`
/// consecutive runs of repeated values.
///
/// Each run's value is a string of `string_len` random `char`s drawn from
/// `rand::rng()`. Every run has `logical_array_len / physical_array_len`
/// copies, and the first `logical_array_len % physical_array_len` runs get one
/// extra copy, so the result has exactly `logical_array_len` elements.
///
/// Note that the parameter order (`physical_array_len` first) differs from
/// `create_primitive_run_array`.
///
/// # Panics
///
/// Panics if `logical_array_len < physical_array_len` or if
/// `physical_array_len` is zero.
pub fn create_string_array_for_runs(
    physical_array_len: usize,
    logical_array_len: usize,
    string_len: usize,
) -> Vec<String> {
    assert!(logical_array_len >= physical_array_len);
    // Fail with a clear message instead of an opaque divide-by-zero below.
    assert!(
        physical_array_len > 0,
        "physical_array_len must be greater than zero"
    );
    let mut rng = rng();
    let run_len = logical_array_len / physical_array_len;
    let longer_runs = logical_array_len % physical_array_len;
    // The run lengths add up to `logical_array_len`, so no padding is needed.
    let mut values: Vec<String> = Vec::with_capacity(logical_array_len);
    for run_idx in 0..physical_array_len {
        let value: String = (0..string_len).map(|_| rng.random::<char>()).collect();
        let take_len = if run_idx < longer_runs {
            run_len + 1
        } else {
            run_len
        };
        values.extend(std::iter::repeat(value).take(take_len));
    }
    values
}
/// Creates a random [`GenericBinaryArray`] whose non-null values are byte
/// strings with lengths drawn from `0..8`.
///
/// A slot is null when an `f32` sample is below `null_density`. Two separate
/// RNGs from `seedable_rng()` are used: one for the null decision and the
/// bytes, the other for the value lengths.
pub fn create_binary_array<Offset: OffsetSizeTrait>(
    size: usize,
    null_density: f32,
) -> GenericBinaryArray<Offset> {
    let value_rng = &mut seedable_rng();
    let len_rng = &mut seedable_rng();
    (0..size)
        .map(|_| {
            if value_rng.random::<f32>() < null_density {
                return None;
            }
            // Lengths come from their own RNG, so drawing the length first
            // does not disturb the byte stream.
            let len: usize = len_rng.random_range(0..8);
            let bytes: Vec<u8> = value_rng
                .sample_iter::<u8, _>(StandardUniform)
                .take(len)
                .collect();
            Some(bytes)
        })
        .collect()
}
/// Creates a random [`FixedSizeBinaryArray`] of `size` slots whose non-null
/// values are `value_len` random bytes each.
///
/// A slot is null when an `f32` sample is below `null_density`.
///
/// # Panics
///
/// Panics if `value_len` does not fit in an `i32`, or if
/// `FixedSizeBinaryArray::try_from_sparse_iter_with_size` returns an error.
pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray {
    // A checked conversion avoids silently truncating an oversized width.
    let byte_width = i32::try_from(value_len).expect("value_len must fit in an i32");
    let rng = &mut seedable_rng();
    let values = (0..size).map(|_| {
        if rng.random::<f32>() < null_density {
            None
        } else {
            let bytes = rng
                .sample_iter::<u8, _>(StandardUniform)
                .take(value_len)
                .collect::<Vec<u8>>();
            Some(bytes)
        }
    });
    FixedSizeBinaryArray::try_from_sparse_iter_with_size(values, byte_width).unwrap()
}
pub fn create_dict_from_values<K>(
size: usize,
null_density: f32,
values: &dyn Array,
) -> DictionaryArray<K>
where
K: ArrowDictionaryKeyType,
StandardUniform: Distribution<K::Native>,
K::Native: SampleUniform,
{
let min_key = K::Native::from_usize(0).unwrap();
let max_key = K::Native::from_usize(values.len()).unwrap();
create_sparse_dict_from_values(size, null_density, values, min_key..max_key)
}
/// Creates a [`DictionaryArray`] of `size` random keys drawn from `key_range`,
/// with `values` as the dictionary.
///
/// All keys are sampled first. Then, unless `null_density` is exactly zero, a
/// validity bitmap is generated in which each slot is null with probability
/// `null_density`.
///
/// # Panics
///
/// Panics if the assembled array data fails validation in `build()`.
///
/// NOTE(review): `random_range` presumably panics on an empty `key_range`, and
/// `random_bool` on a `null_density` outside `[0, 1]` — confirm against the
/// `rand` docs.
pub fn create_sparse_dict_from_values<K>(
    size: usize,
    null_density: f32,
    values: &dyn Array,
    key_range: Range<K::Native>,
) -> DictionaryArray<K>
where
    K: ArrowDictionaryKeyType,
    StandardUniform: Distribution<K::Native>,
    K::Native: SampleUniform,
{
    let mut rng = seedable_rng();
    let data_type =
        DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
    let keys: Buffer = (0..size)
        .map(|_| rng.random_range(key_range.clone()))
        .collect();
    let nulls: Option<Buffer> = (null_density != 0.).then(|| {
        (0..size)
            // A set bit in the validity bitmap means "valid", so the draw that
            // succeeds with probability `null_density` must clear the bit.
            .map(|_| !rng.random_bool(f64::from(null_density)))
            .collect()
    });
    let data = ArrayDataBuilder::new(data_type)
        .len(size)
        .null_bit_buffer(nulls)
        .add_buffer(keys)
        .add_child_data(values.to_data())
        .build()
        .unwrap();
    DictionaryArray::from(data)
}
/// Creates a [`Float16Array`] of `size` non-null values, using the RNG
/// returned by `seedable_rng()`.
///
/// A slot is `f16::NAN` when an `f32` sample is below `nan_density`; otherwise
/// it is a freshly sampled `f32` converted with `f16::from_f32`.
pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
    let mut rng = seedable_rng();
    (0..size)
        .map(|_| {
            let is_nan = rng.random::<f32>() < nan_density;
            let value = if is_nan {
                f16::NAN
            } else {
                f16::from_f32(rng.random::<f32>())
            };
            Some(value)
        })
        .collect()
}
/// Creates a [`Float32Array`] of `size` non-null values, using the RNG
/// returned by `seedable_rng()`.
///
/// A slot is `f32::NAN` when an `f32` sample is below `nan_density`; otherwise
/// it is a freshly sampled `f32`.
pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
    let mut rng = seedable_rng();
    (0..size)
        .map(|_| {
            let is_nan = rng.random::<f32>() < nan_density;
            let value = if is_nan {
                f32::NAN
            } else {
                rng.random::<f32>()
            };
            Some(value)
        })
        .collect()
}
/// Creates a [`Float64Array`] of `size` non-null values, using the RNG
/// returned by `seedable_rng()`.
///
/// A slot is `f64::NAN` when an `f32` sample is below `nan_density`; otherwise
/// it is a freshly sampled `f64`.
pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
    let mut rng = seedable_rng();
    (0..size)
        .map(|_| {
            let is_nan = rng.random::<f32>() < nan_density;
            let value = if is_nan {
                f64::NAN
            } else {
                rng.random::<f64>()
            };
            Some(value)
        })
        .collect()
}