vortex_array/arrays/
arbitrary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::iter;
5use std::sync::Arc;
6
7use arbitrary::{Arbitrary, Result, Unstructured};
8use arrow_buffer::BooleanBuffer;
9use builders::ListBuilder;
10use vortex_buffer::Buffer;
11use vortex_dtype::{DType, NativePType, Nullability, PType};
12use vortex_error::{VortexExpect, VortexUnwrap};
13use vortex_scalar::arbitrary::random_scalar;
14use vortex_scalar::{Scalar, match_each_decimal_value_type};
15
16use super::{
17    BoolArray, ChunkedArray, NullArray, PrimitiveArray, StructArray, smallest_storage_type,
18};
19use crate::arrays::{VarBinArray, VarBinViewArray};
20use crate::builders::{ArrayBuilder, DecimalBuilder, FixedSizeListBuilder};
21use crate::validity::Validity;
22use crate::{Array, ArrayRef, IntoArray, OffsetPType, ToCanonical, builders};
23
24/// A wrapper type to implement `Arbitrary` for `ArrayRef`.
25#[derive(Clone, Debug)]
26pub struct ArbitraryArray(pub ArrayRef);
27
28impl<'a> Arbitrary<'a> for ArbitraryArray {
29    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
30        let dtype = u.arbitrary()?;
31        Self::arbitrary_with(u, None, &dtype)
32    }
33}
34
35impl ArbitraryArray {
36    pub fn arbitrary_with(u: &mut Unstructured, len: Option<usize>, dtype: &DType) -> Result<Self> {
37        random_array(u, dtype, len).map(ArbitraryArray)
38    }
39}
40
/// Splits `n` into `parts` non-negative integers that sum to `n` and differ by at most one.
///
/// The smaller values come first: the leading `parts - (n % parts)` entries are `n / parts`
/// and the trailing `n % parts` entries are one larger.
///
/// # Panics
///
/// Panics if `parts` is zero (remainder by zero).
fn split_number_into_parts(n: usize, parts: usize) -> Vec<usize> {
    let remainder = n % parts;
    let base = n / parts;
    // Index of the first entry that receives one of the leftover units.
    let first_larger = parts - remainder;
    (0..parts)
        .map(|idx| if idx < first_larger { base } else { base + 1 })
        .collect()
}
48
49/// Creates a random array with a random number of chunks.
50fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<ArrayRef> {
51    let num_chunks = u.int_in_range(1..=3)?;
52    let chunk_lens = len.map(|l| split_number_into_parts(l, num_chunks));
53    let mut chunks = (0..num_chunks)
54        .map(|i| {
55            let chunk_len = chunk_lens.as_ref().map(|c| c[i]);
56            random_array_chunk(u, dtype, chunk_len)
57        })
58        .collect::<Result<Vec<_>>>()?;
59
60    if chunks.len() == 1 {
61        Ok(chunks.remove(0))
62    } else {
63        let dtype = chunks[0].dtype().clone();
64        Ok(ChunkedArray::try_new(chunks, dtype)
65            .vortex_unwrap()
66            .into_array())
67    }
68}
69
70/// Creates a random array chunk.
71fn random_array_chunk(
72    u: &mut Unstructured<'_>,
73    dtype: &DType,
74    chunk_len: Option<usize>,
75) -> Result<ArrayRef> {
76    match dtype {
77        DType::Null => Ok(NullArray::new(
78            chunk_len
79                .map(Ok)
80                .unwrap_or_else(|| u.int_in_range(0..=100))?,
81        )
82        .into_array()),
83        DType::Bool(n) => random_bool(u, *n, chunk_len),
84        DType::Primitive(ptype, n) => match ptype {
85            PType::U8 => random_primitive::<u8>(u, *n, chunk_len),
86            PType::U16 => random_primitive::<u16>(u, *n, chunk_len),
87            PType::U32 => random_primitive::<u32>(u, *n, chunk_len),
88            PType::U64 => random_primitive::<u64>(u, *n, chunk_len),
89            PType::I8 => random_primitive::<i8>(u, *n, chunk_len),
90            PType::I16 => random_primitive::<i16>(u, *n, chunk_len),
91            PType::I32 => random_primitive::<i32>(u, *n, chunk_len),
92            PType::I64 => random_primitive::<i64>(u, *n, chunk_len),
93            PType::F16 => Ok(random_primitive::<u16>(u, *n, chunk_len)?
94                .to_primitive()
95                .reinterpret_cast(PType::F16)
96                .into_array()),
97            PType::F32 => random_primitive::<f32>(u, *n, chunk_len),
98            PType::F64 => random_primitive::<f64>(u, *n, chunk_len),
99        },
100        DType::Decimal(decimal, n) => {
101            let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
102            match_each_decimal_value_type!(smallest_storage_type(decimal), |DVT| {
103                let mut builder =
104                    DecimalBuilder::new::<DVT>(decimal.precision(), decimal.scale(), *n);
105                for _i in 0..elem_len {
106                    let random_decimal = random_scalar(u, &DType::Decimal(*decimal, *n))?;
107                    builder.append_scalar(&random_decimal).vortex_expect(
108                        "was somehow unable to append a decimal to a decimal builder",
109                    );
110                }
111                Ok(builder.finish())
112            })
113        }
114        DType::Utf8(n) => random_string(u, *n, chunk_len),
115        DType::Binary(n) => random_bytes(u, *n, chunk_len),
116        DType::Struct(sdt, n) => {
117            let first_array = sdt
118                .fields()
119                .next()
120                .map(|d| random_array(u, &d, chunk_len))
121                .transpose()?;
122            let resolved_len = first_array
123                .as_ref()
124                .map(|a| a.len())
125                .or(chunk_len)
126                .map(Ok)
127                .unwrap_or_else(|| u.int_in_range(0..=100))?;
128            let children = first_array
129                .into_iter()
130                .map(Ok)
131                .chain(
132                    sdt.fields()
133                        .skip(1)
134                        .map(|d| random_array(u, &d, Some(resolved_len))),
135                )
136                .collect::<Result<Vec<_>>>()?;
137            Ok(StructArray::try_new(
138                sdt.names().clone(),
139                children,
140                resolved_len,
141                random_validity(u, *n, resolved_len)?,
142            )
143            .vortex_unwrap()
144            .into_array())
145        }
146        DType::List(elem_dtype, null) => random_list(u, elem_dtype, *null, chunk_len),
147        DType::FixedSizeList(elem_dtype, list_size, null) => {
148            random_fixed_size_list(u, elem_dtype, *list_size, *null, chunk_len)
149        }
150        DType::Extension(..) => {
151            todo!("Extension arrays are not implemented")
152        }
153    }
154}
155
156/// Creates a random fixed-size list array.
157///
158/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
159fn random_fixed_size_list(
160    u: &mut Unstructured,
161    elem_dtype: &Arc<DType>,
162    list_size: u32,
163    null: Nullability,
164    chunk_len: Option<usize>,
165) -> Result<ArrayRef> {
166    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
167
168    let mut builder =
169        FixedSizeListBuilder::with_capacity(elem_dtype.clone(), list_size, null, array_length);
170
171    for _ in 0..array_length {
172        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
173            builder.append_null();
174        } else {
175            builder
176                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
177                .vortex_expect("can append value");
178        }
179    }
180
181    Ok(builder.finish())
182}
183
184/// Creates a random list array.
185///
186/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
187fn random_list(
188    u: &mut Unstructured,
189    elem_dtype: &Arc<DType>,
190    null: Nullability,
191    chunk_len: Option<usize>,
192) -> Result<ArrayRef> {
193    match u.int_in_range(0..=5)? {
194        0 => random_list_with_offset_type::<i16>(u, elem_dtype, null, chunk_len),
195        1 => random_list_with_offset_type::<i32>(u, elem_dtype, null, chunk_len),
196        2 => random_list_with_offset_type::<i64>(u, elem_dtype, null, chunk_len),
197        3 => random_list_with_offset_type::<u16>(u, elem_dtype, null, chunk_len),
198        4 => random_list_with_offset_type::<u32>(u, elem_dtype, null, chunk_len),
199        5 => random_list_with_offset_type::<u64>(u, elem_dtype, null, chunk_len),
200        _ => unreachable!("int_in_range returns a value in the above range"),
201    }
202}
203
204/// Creates a random list array with the given [`OffsetPType`] for the internal offsets child.
205///
206/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
207fn random_list_with_offset_type<O: OffsetPType>(
208    u: &mut Unstructured,
209    elem_dtype: &Arc<DType>,
210    null: Nullability,
211    chunk_len: Option<usize>,
212) -> Result<ArrayRef> {
213    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
214
215    let mut builder = ListBuilder::<O>::with_capacity(elem_dtype.clone(), null, 10);
216
217    for _ in 0..array_length {
218        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
219            builder.append_null();
220        } else {
221            let list_size = u.int_in_range(0..=20)?;
222            builder
223                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
224                .vortex_expect("can append value");
225        }
226    }
227
228    Ok(builder.finish())
229}
230
231/// Creates a random list scalar with the specified list size.
232fn random_list_scalar(
233    u: &mut Unstructured,
234    elem_dtype: &Arc<DType>,
235    list_size: u32,
236    null: Nullability,
237) -> Result<Scalar> {
238    let elems = (0..list_size)
239        .map(|_| random_scalar(u, elem_dtype))
240        .collect::<Result<Vec<_>>>()?;
241    Ok(Scalar::list(elem_dtype.clone(), elems, null))
242}
243
244fn random_string(
245    u: &mut Unstructured,
246    nullability: Nullability,
247    len: Option<usize>,
248) -> Result<ArrayRef> {
249    match nullability {
250        Nullability::NonNullable => {
251            let v = arbitrary_vec_of_len::<String>(u, len)?;
252            Ok(match u.int_in_range(0..=1)? {
253                0 => VarBinArray::from_vec(v, DType::Utf8(Nullability::NonNullable)).into_array(),
254                1 => VarBinViewArray::from_iter_str(v).into_array(),
255                _ => unreachable!(),
256            })
257        }
258        Nullability::Nullable => {
259            let v = arbitrary_vec_of_len::<Option<String>>(u, len)?;
260            Ok(match u.int_in_range(0..=1)? {
261                0 => VarBinArray::from_iter(v, DType::Utf8(Nullability::Nullable)).into_array(),
262                1 => VarBinViewArray::from_iter_nullable_str(v).into_array(),
263                _ => unreachable!(),
264            })
265        }
266    }
267}
268
269fn random_bytes(
270    u: &mut Unstructured,
271    nullability: Nullability,
272    len: Option<usize>,
273) -> Result<ArrayRef> {
274    match nullability {
275        Nullability::NonNullable => {
276            let v = arbitrary_vec_of_len::<Vec<u8>>(u, len)?;
277            Ok(match u.int_in_range(0..=1)? {
278                0 => VarBinArray::from_vec(v, DType::Binary(Nullability::NonNullable)).into_array(),
279                1 => VarBinViewArray::from_iter_bin(v).into_array(),
280                _ => unreachable!(),
281            })
282        }
283        Nullability::Nullable => {
284            let v = arbitrary_vec_of_len::<Option<Vec<u8>>>(u, len)?;
285            Ok(match u.int_in_range(0..=1)? {
286                0 => VarBinArray::from_iter(v, DType::Binary(Nullability::Nullable)).into_array(),
287                1 => VarBinViewArray::from_iter_nullable_bin(v).into_array(),
288                _ => unreachable!(),
289            })
290        }
291    }
292}
293
294fn random_primitive<'a, T: Arbitrary<'a> + NativePType>(
295    u: &mut Unstructured<'a>,
296    nullability: Nullability,
297    len: Option<usize>,
298) -> Result<ArrayRef> {
299    let v = arbitrary_vec_of_len::<T>(u, len)?;
300    let validity = random_validity(u, nullability, v.len())?;
301    Ok(PrimitiveArray::new(Buffer::copy_from(v), validity).into_array())
302}
303
304fn random_bool(
305    u: &mut Unstructured,
306    nullability: Nullability,
307    len: Option<usize>,
308) -> Result<ArrayRef> {
309    let v = arbitrary_vec_of_len(u, len)?;
310    let validity = random_validity(u, nullability, v.len())?;
311    Ok(BoolArray::from_bool_buffer(BooleanBuffer::from(v), validity).into_array())
312}
313
314fn random_validity(u: &mut Unstructured, nullability: Nullability, len: usize) -> Result<Validity> {
315    match nullability {
316        Nullability::NonNullable => Ok(Validity::NonNullable),
317        Nullability::Nullable => Ok(match u.int_in_range(0..=2)? {
318            0 => Validity::AllValid,
319            1 => Validity::AllInvalid,
320            2 => Validity::from_iter(arbitrary_vec_of_len::<bool>(u, Some(len))?),
321            _ => unreachable!(),
322        }),
323    }
324}
325
326fn arbitrary_vec_of_len<'a, T: Arbitrary<'a>>(
327    u: &mut Unstructured<'a>,
328    len: Option<usize>,
329) -> Result<Vec<T>> {
330    len.map(|l| (0..l).map(|_| T::arbitrary(u)).collect::<Result<Vec<_>>>())
331        .unwrap_or_else(|| Vec::<T>::arbitrary(u))
332}