vortex_array/arrays/
arbitrary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::iter;
5use std::sync::Arc;
6
7use arbitrary::{Arbitrary, Result, Unstructured};
8use arrow_buffer::BooleanBuffer;
9use vortex_buffer::Buffer;
10use vortex_dtype::{DType, IntegerPType, NativePType, Nullability, PType};
11use vortex_error::{VortexExpect, VortexUnwrap};
12use vortex_scalar::arbitrary::random_scalar;
13use vortex_scalar::{Scalar, match_each_decimal_value_type};
14
15use super::{BoolArray, ChunkedArray, NullArray, PrimitiveArray, StructArray};
16use crate::arrays::{VarBinArray, VarBinViewArray, smallest_decimal_value_type};
17use crate::builders::{ArrayBuilder, DecimalBuilder, FixedSizeListBuilder, ListViewBuilder};
18use crate::validity::Validity;
19use crate::{Array, ArrayRef, IntoArray, ToCanonical};
20
21/// A wrapper type to implement `Arbitrary` for `ArrayRef`.
22#[derive(Clone, Debug)]
23pub struct ArbitraryArray(pub ArrayRef);
24
25impl<'a> Arbitrary<'a> for ArbitraryArray {
26    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
27        let dtype = u.arbitrary()?;
28        Self::arbitrary_with(u, None, &dtype)
29    }
30}
31
32impl ArbitraryArray {
33    pub fn arbitrary_with(u: &mut Unstructured, len: Option<usize>, dtype: &DType) -> Result<Self> {
34        random_array(u, dtype, len).map(ArbitraryArray)
35    }
36}
37
/// Splits `n` into `parts` sizes that sum to `n` and differ by at most one.
///
/// The smaller sizes come first; the last `n % parts` entries are one larger than the rest.
///
/// # Panics
///
/// Panics if `parts` is zero.
fn split_number_into_parts(n: usize, parts: usize) -> Vec<usize> {
    let extra = n % parts;
    let base = n / parts;
    // Index of the first entry that receives one of the `extra` leftover units.
    let first_larger = parts - extra;
    iter::repeat_n(base, parts)
        .enumerate()
        .map(|(idx, size)| if idx < first_larger { size } else { size + 1 })
        .collect()
}
45
46/// Creates a random array with a random number of chunks.
47fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<ArrayRef> {
48    let num_chunks = u.int_in_range(1..=3)?;
49    let chunk_lens = len.map(|l| split_number_into_parts(l, num_chunks));
50    let mut chunks = (0..num_chunks)
51        .map(|i| {
52            let chunk_len = chunk_lens.as_ref().map(|c| c[i]);
53            random_array_chunk(u, dtype, chunk_len)
54        })
55        .collect::<Result<Vec<_>>>()?;
56
57    if chunks.len() == 1 {
58        Ok(chunks.remove(0))
59    } else {
60        let dtype = chunks[0].dtype().clone();
61        Ok(ChunkedArray::try_new(chunks, dtype)
62            .vortex_unwrap()
63            .into_array())
64    }
65}
66
67/// Creates a random array chunk.
68fn random_array_chunk(
69    u: &mut Unstructured<'_>,
70    dtype: &DType,
71    chunk_len: Option<usize>,
72) -> Result<ArrayRef> {
73    match dtype {
74        DType::Null => Ok(NullArray::new(
75            chunk_len
76                .map(Ok)
77                .unwrap_or_else(|| u.int_in_range(0..=100))?,
78        )
79        .into_array()),
80        DType::Bool(n) => random_bool(u, *n, chunk_len),
81        DType::Primitive(ptype, n) => match ptype {
82            PType::U8 => random_primitive::<u8>(u, *n, chunk_len),
83            PType::U16 => random_primitive::<u16>(u, *n, chunk_len),
84            PType::U32 => random_primitive::<u32>(u, *n, chunk_len),
85            PType::U64 => random_primitive::<u64>(u, *n, chunk_len),
86            PType::I8 => random_primitive::<i8>(u, *n, chunk_len),
87            PType::I16 => random_primitive::<i16>(u, *n, chunk_len),
88            PType::I32 => random_primitive::<i32>(u, *n, chunk_len),
89            PType::I64 => random_primitive::<i64>(u, *n, chunk_len),
90            PType::F16 => Ok(random_primitive::<u16>(u, *n, chunk_len)?
91                .to_primitive()
92                .reinterpret_cast(PType::F16)
93                .into_array()),
94            PType::F32 => random_primitive::<f32>(u, *n, chunk_len),
95            PType::F64 => random_primitive::<f64>(u, *n, chunk_len),
96        },
97        DType::Decimal(decimal, n) => {
98            let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
99            match_each_decimal_value_type!(smallest_decimal_value_type(decimal), |DVT| {
100                let mut builder =
101                    DecimalBuilder::new::<DVT>(decimal.precision(), decimal.scale(), *n);
102                for _i in 0..elem_len {
103                    let random_decimal = random_scalar(u, &DType::Decimal(*decimal, *n))?;
104                    builder.append_scalar(&random_decimal).vortex_expect(
105                        "was somehow unable to append a decimal to a decimal builder",
106                    );
107                }
108                Ok(builder.finish())
109            })
110        }
111        DType::Utf8(n) => random_string(u, *n, chunk_len),
112        DType::Binary(n) => random_bytes(u, *n, chunk_len),
113        DType::Struct(sdt, n) => {
114            let first_array = sdt
115                .fields()
116                .next()
117                .map(|d| random_array(u, &d, chunk_len))
118                .transpose()?;
119            let resolved_len = first_array
120                .as_ref()
121                .map(|a| a.len())
122                .or(chunk_len)
123                .map(Ok)
124                .unwrap_or_else(|| u.int_in_range(0..=100))?;
125            let children = first_array
126                .into_iter()
127                .map(Ok)
128                .chain(
129                    sdt.fields()
130                        .skip(1)
131                        .map(|d| random_array(u, &d, Some(resolved_len))),
132                )
133                .collect::<Result<Vec<_>>>()?;
134            Ok(StructArray::try_new(
135                sdt.names().clone(),
136                children,
137                resolved_len,
138                random_validity(u, *n, resolved_len)?,
139            )
140            .vortex_unwrap()
141            .into_array())
142        }
143        DType::List(elem_dtype, null) => random_list(u, elem_dtype, *null, chunk_len),
144        DType::FixedSizeList(elem_dtype, list_size, null) => {
145            random_fixed_size_list(u, elem_dtype, *list_size, *null, chunk_len)
146        }
147        DType::Extension(..) => {
148            todo!("Extension arrays are not implemented")
149        }
150    }
151}
152
153/// Creates a random fixed-size list array.
154///
155/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
156fn random_fixed_size_list(
157    u: &mut Unstructured,
158    elem_dtype: &Arc<DType>,
159    list_size: u32,
160    null: Nullability,
161    chunk_len: Option<usize>,
162) -> Result<ArrayRef> {
163    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
164
165    let mut builder =
166        FixedSizeListBuilder::with_capacity(elem_dtype.clone(), list_size, null, array_length);
167
168    for _ in 0..array_length {
169        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
170            builder.append_null();
171        } else {
172            builder
173                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
174                .vortex_expect("can append value");
175        }
176    }
177
178    Ok(builder.finish())
179}
180
181/// Creates a random list array.
182///
183/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
184fn random_list(
185    u: &mut Unstructured,
186    elem_dtype: &Arc<DType>,
187    null: Nullability,
188    chunk_len: Option<usize>,
189) -> Result<ArrayRef> {
190    match u.int_in_range(0..=5)? {
191        0 => random_list_with_offset_type::<i16>(u, elem_dtype, null, chunk_len),
192        1 => random_list_with_offset_type::<i32>(u, elem_dtype, null, chunk_len),
193        2 => random_list_with_offset_type::<i64>(u, elem_dtype, null, chunk_len),
194        3 => random_list_with_offset_type::<u16>(u, elem_dtype, null, chunk_len),
195        4 => random_list_with_offset_type::<u32>(u, elem_dtype, null, chunk_len),
196        5 => random_list_with_offset_type::<u64>(u, elem_dtype, null, chunk_len),
197        _ => unreachable!("int_in_range returns a value in the above range"),
198    }
199}
200
201/// Creates a random list array with the given [`IntegerPType`] for the internal offsets child.
202///
203/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
204fn random_list_with_offset_type<O: IntegerPType>(
205    u: &mut Unstructured,
206    elem_dtype: &Arc<DType>,
207    null: Nullability,
208    chunk_len: Option<usize>,
209) -> Result<ArrayRef> {
210    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
211
212    let mut builder = ListViewBuilder::<O, O>::with_capacity(elem_dtype.clone(), null, 20, 10);
213
214    for _ in 0..array_length {
215        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
216            builder.append_null();
217        } else {
218            let list_size = u.int_in_range(0..=20)?;
219            builder
220                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
221                .vortex_expect("can append value");
222        }
223    }
224
225    Ok(builder.finish())
226}
227
228/// Creates a random list scalar with the specified list size.
229fn random_list_scalar(
230    u: &mut Unstructured,
231    elem_dtype: &Arc<DType>,
232    list_size: u32,
233    null: Nullability,
234) -> Result<Scalar> {
235    let elems = (0..list_size)
236        .map(|_| random_scalar(u, elem_dtype))
237        .collect::<Result<Vec<_>>>()?;
238    Ok(Scalar::list(elem_dtype.clone(), elems, null))
239}
240
241fn random_string(
242    u: &mut Unstructured,
243    nullability: Nullability,
244    len: Option<usize>,
245) -> Result<ArrayRef> {
246    match nullability {
247        Nullability::NonNullable => {
248            let v = arbitrary_vec_of_len::<String>(u, len)?;
249            Ok(match u.int_in_range(0..=1)? {
250                0 => VarBinArray::from_vec(v, DType::Utf8(Nullability::NonNullable)).into_array(),
251                1 => VarBinViewArray::from_iter_str(v).into_array(),
252                _ => unreachable!(),
253            })
254        }
255        Nullability::Nullable => {
256            let v = arbitrary_vec_of_len::<Option<String>>(u, len)?;
257            Ok(match u.int_in_range(0..=1)? {
258                0 => VarBinArray::from_iter(v, DType::Utf8(Nullability::Nullable)).into_array(),
259                1 => VarBinViewArray::from_iter_nullable_str(v).into_array(),
260                _ => unreachable!(),
261            })
262        }
263    }
264}
265
266fn random_bytes(
267    u: &mut Unstructured,
268    nullability: Nullability,
269    len: Option<usize>,
270) -> Result<ArrayRef> {
271    match nullability {
272        Nullability::NonNullable => {
273            let v = arbitrary_vec_of_len::<Vec<u8>>(u, len)?;
274            Ok(match u.int_in_range(0..=1)? {
275                0 => VarBinArray::from_vec(v, DType::Binary(Nullability::NonNullable)).into_array(),
276                1 => VarBinViewArray::from_iter_bin(v).into_array(),
277                _ => unreachable!(),
278            })
279        }
280        Nullability::Nullable => {
281            let v = arbitrary_vec_of_len::<Option<Vec<u8>>>(u, len)?;
282            Ok(match u.int_in_range(0..=1)? {
283                0 => VarBinArray::from_iter(v, DType::Binary(Nullability::Nullable)).into_array(),
284                1 => VarBinViewArray::from_iter_nullable_bin(v).into_array(),
285                _ => unreachable!(),
286            })
287        }
288    }
289}
290
291fn random_primitive<'a, T: Arbitrary<'a> + NativePType>(
292    u: &mut Unstructured<'a>,
293    nullability: Nullability,
294    len: Option<usize>,
295) -> Result<ArrayRef> {
296    let v = arbitrary_vec_of_len::<T>(u, len)?;
297    let validity = random_validity(u, nullability, v.len())?;
298    Ok(PrimitiveArray::new(Buffer::copy_from(v), validity).into_array())
299}
300
301fn random_bool(
302    u: &mut Unstructured,
303    nullability: Nullability,
304    len: Option<usize>,
305) -> Result<ArrayRef> {
306    let v = arbitrary_vec_of_len(u, len)?;
307    let validity = random_validity(u, nullability, v.len())?;
308    Ok(BoolArray::from_bool_buffer(BooleanBuffer::from(v), validity).into_array())
309}
310
311fn random_validity(u: &mut Unstructured, nullability: Nullability, len: usize) -> Result<Validity> {
312    match nullability {
313        Nullability::NonNullable => Ok(Validity::NonNullable),
314        Nullability::Nullable => Ok(match u.int_in_range(0..=2)? {
315            0 => Validity::AllValid,
316            1 => Validity::AllInvalid,
317            2 => Validity::from_iter(arbitrary_vec_of_len::<bool>(u, Some(len))?),
318            _ => unreachable!(),
319        }),
320    }
321}
322
323fn arbitrary_vec_of_len<'a, T: Arbitrary<'a>>(
324    u: &mut Unstructured<'a>,
325    len: Option<usize>,
326) -> Result<Vec<T>> {
327    len.map(|l| (0..l).map(|_| T::arbitrary(u)).collect::<Result<Vec<_>>>())
328        .unwrap_or_else(|| Vec::<T>::arbitrary(u))
329}