Skip to main content

vortex_array/arrays/
arbitrary.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::iter;
5use std::ops::RangeInclusive;
6use std::sync::Arc;
7
8use arbitrary::Arbitrary;
9use arbitrary::Error::IncorrectFormat;
10use arbitrary::Result;
11use arbitrary::Unstructured;
12use vortex_buffer::BitBuffer;
13use vortex_buffer::Buffer;
14use vortex_error::VortexExpect;
15
16use crate::ArrayRef;
17use crate::IntoArray;
18#[expect(deprecated)]
19use crate::ToCanonical as _;
20use crate::arrays::BoolArray;
21use crate::arrays::ChunkedArray;
22use crate::arrays::NullArray;
23use crate::arrays::PrimitiveArray;
24use crate::arrays::StructArray;
25use crate::arrays::VarBinArray;
26use crate::arrays::VarBinViewArray;
27use crate::arrays::primitive::PrimitiveArrayExt;
28use crate::builders::ArrayBuilder;
29use crate::builders::DecimalBuilder;
30use crate::builders::FixedSizeListBuilder;
31use crate::builders::ListViewBuilder;
32use crate::dtype::DType;
33use crate::dtype::IntegerPType;
34use crate::dtype::NativePType;
35use crate::dtype::Nullability;
36use crate::dtype::PType;
37use crate::match_each_decimal_value_type;
38use crate::scalar::Scalar;
39use crate::scalar::arbitrary::random_scalar;
40use crate::validity::Validity;
41
42/// A wrapper type to implement `Arbitrary` for `ArrayRef`.
43#[derive(Clone, Debug)]
44pub struct ArbitraryArray(pub ArrayRef);
45
46/// Trait for generating arbitrary values with a caller-provided configuration.
47pub trait ArbitraryWith<'a, C>: Sized {
48    /// Generate an arbitrary value using the provided configuration.
49    fn arbitrary_with_config(u: &mut Unstructured<'a>, config: &C) -> Result<Self>;
50}
51
52/// Configuration for arbitrary array generation.
53#[derive(Clone, Debug)]
54pub struct ArbitraryArrayConfig {
55    /// Fixed dtype, or `None` to generate one from [`Unstructured`].
56    pub dtype: Option<DType>,
57    /// Inclusive range for the total array length.
58    pub len: RangeInclusive<usize>,
59}
60
61impl<'a> ArbitraryWith<'a, ArbitraryArrayConfig> for ArbitraryArray {
62    fn arbitrary_with_config(
63        u: &mut Unstructured<'a>,
64        config: &ArbitraryArrayConfig,
65    ) -> Result<Self> {
66        if config.len.is_empty() {
67            return Err(IncorrectFormat);
68        }
69
70        let dtype = match &config.dtype {
71            Some(dtype) => dtype.clone(),
72            None => u.arbitrary()?,
73        };
74        let len = u.int_in_range(config.len.clone())?;
75
76        random_array(u, &dtype, Some(len)).map(ArbitraryArray)
77    }
78}
79
/// Splits `n` into `parts` sizes that sum to `n` and differ from each other by at most one.
///
/// The smaller sizes come first: the leading `parts - n % parts` entries equal `n / parts`,
/// and the trailing `n % parts` entries are one larger.
///
/// # Panics
///
/// Panics if `parts` is zero.
fn split_number_into_parts(n: usize, parts: usize) -> Vec<usize> {
    let extra = n % parts;
    let base = n / parts;
    let larger = base + 1;
    // Index of the first entry that receives one of the `extra` leftover units.
    let first_larger = parts - extra;

    (0..parts)
        .map(|idx| if idx < first_larger { base } else { larger })
        .collect()
}
87
88/// Creates a random array with a random number of chunks.
89fn random_array(u: &mut Unstructured, dtype: &DType, len: Option<usize>) -> Result<ArrayRef> {
90    let num_chunks = u.int_in_range(1..=3)?;
91    let chunk_lens = len.map(|l| split_number_into_parts(l, num_chunks));
92    let mut chunks = (0..num_chunks)
93        .map(|i| {
94            let chunk_len = chunk_lens.as_ref().map(|c| c[i]);
95            random_array_chunk(u, dtype, chunk_len)
96        })
97        .collect::<Result<Vec<_>>>()?;
98
99    if chunks.len() == 1 {
100        Ok(chunks.remove(0))
101    } else {
102        let dtype = chunks[0].dtype().clone();
103        Ok(ChunkedArray::try_new(chunks, dtype)
104            .vortex_expect("operation should succeed in arbitrary impl")
105            .into_array())
106    }
107}
108
109/// Creates a random array chunk.
110fn random_array_chunk(
111    u: &mut Unstructured<'_>,
112    dtype: &DType,
113    chunk_len: Option<usize>,
114) -> Result<ArrayRef> {
115    match dtype {
116        DType::Null => Ok(NullArray::new(
117            chunk_len
118                .map(Ok)
119                .unwrap_or_else(|| u.int_in_range(0..=100))?,
120        )
121        .into_array()),
122        DType::Bool(n) => random_bool(u, *n, chunk_len),
123        DType::Primitive(ptype, n) => match ptype {
124            PType::U8 => random_primitive::<u8>(u, *n, chunk_len),
125            PType::U16 => random_primitive::<u16>(u, *n, chunk_len),
126            PType::U32 => random_primitive::<u32>(u, *n, chunk_len),
127            PType::U64 => random_primitive::<u64>(u, *n, chunk_len),
128            PType::I8 => random_primitive::<i8>(u, *n, chunk_len),
129            PType::I16 => random_primitive::<i16>(u, *n, chunk_len),
130            PType::I32 => random_primitive::<i32>(u, *n, chunk_len),
131            PType::I64 => random_primitive::<i64>(u, *n, chunk_len),
132            PType::F16 => {
133                #[expect(deprecated)]
134                let prim = random_primitive::<u16>(u, *n, chunk_len)?
135                    .to_primitive()
136                    .reinterpret_cast(PType::F16)
137                    .into_array();
138                Ok(prim)
139            }
140            PType::F32 => random_primitive::<f32>(u, *n, chunk_len),
141            PType::F64 => random_primitive::<f64>(u, *n, chunk_len),
142        },
143        d @ DType::Decimal(decimal, n) => {
144            let elem_len = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
145            match_each_decimal_value_type!(DecimalType::smallest_decimal_value_type(decimal), |D| {
146                let mut builder = DecimalBuilder::new::<D>(*decimal, *n);
147                for _i in 0..elem_len {
148                    let random_decimal = random_scalar(u, d)?;
149                    builder.append_scalar(&random_decimal).vortex_expect(
150                        "was somehow unable to append a decimal to a decimal builder",
151                    );
152                }
153                Ok(builder.finish())
154            })
155        }
156        DType::Utf8(n) => random_string(u, *n, chunk_len),
157        DType::Binary(n) => random_bytes(u, *n, chunk_len),
158        DType::Struct(sdt, n) => {
159            let first_array = sdt
160                .fields()
161                .next()
162                .map(|d| random_array(u, &d, chunk_len))
163                .transpose()?;
164            let resolved_len = first_array
165                .as_ref()
166                .map(|a| a.len())
167                .or(chunk_len)
168                .map(Ok)
169                .unwrap_or_else(|| u.int_in_range(0..=100))?;
170            let children = first_array
171                .into_iter()
172                .map(Ok)
173                .chain(
174                    sdt.fields()
175                        .skip(1)
176                        .map(|d| random_array(u, &d, Some(resolved_len))),
177                )
178                .collect::<Result<Vec<_>>>()?;
179            Ok(StructArray::try_new(
180                sdt.names().clone(),
181                children,
182                resolved_len,
183                random_validity(u, *n, resolved_len)?,
184            )
185            .vortex_expect("operation should succeed in arbitrary impl")
186            .into_array())
187        }
188        DType::List(elem_dtype, null) => random_list(u, elem_dtype, *null, chunk_len),
189        DType::FixedSizeList(elem_dtype, list_size, null) => {
190            random_fixed_size_list(u, elem_dtype, *list_size, *null, chunk_len)
191        }
192        DType::Extension(..) => {
193            unimplemented!("Extension arrays are not implemented")
194        }
195        DType::Variant(_) => {
196            unimplemented!("Variant arrays are not implemented")
197        }
198    }
199}
200
201/// Creates a random fixed-size list array.
202///
203/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
204fn random_fixed_size_list(
205    u: &mut Unstructured,
206    elem_dtype: &Arc<DType>,
207    list_size: u32,
208    null: Nullability,
209    chunk_len: Option<usize>,
210) -> Result<ArrayRef> {
211    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
212
213    let mut builder =
214        FixedSizeListBuilder::with_capacity(Arc::clone(elem_dtype), list_size, null, array_length);
215
216    for _ in 0..array_length {
217        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
218            builder.append_null();
219        } else {
220            builder
221                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
222                .vortex_expect("can append value");
223        }
224    }
225
226    Ok(builder.finish())
227}
228
229/// Creates a random list array.
230///
231/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
232fn random_list(
233    u: &mut Unstructured,
234    elem_dtype: &Arc<DType>,
235    null: Nullability,
236    chunk_len: Option<usize>,
237) -> Result<ArrayRef> {
238    match u.int_in_range(0..=5)? {
239        0 => random_list_with_offset_type::<i16>(u, elem_dtype, null, chunk_len),
240        1 => random_list_with_offset_type::<i32>(u, elem_dtype, null, chunk_len),
241        2 => random_list_with_offset_type::<i64>(u, elem_dtype, null, chunk_len),
242        3 => random_list_with_offset_type::<u16>(u, elem_dtype, null, chunk_len),
243        4 => random_list_with_offset_type::<u32>(u, elem_dtype, null, chunk_len),
244        5 => random_list_with_offset_type::<u64>(u, elem_dtype, null, chunk_len),
245        _ => unreachable!("int_in_range returns a value in the above range"),
246    }
247}
248
249/// Creates a random list array with the given [`IntegerPType`] for the internal offsets child.
250///
251/// If the `chunk_len` is specified, the length of the array will be equal to the chunk length.
252fn random_list_with_offset_type<O: IntegerPType>(
253    u: &mut Unstructured,
254    elem_dtype: &Arc<DType>,
255    null: Nullability,
256    chunk_len: Option<usize>,
257) -> Result<ArrayRef> {
258    let array_length = chunk_len.unwrap_or(u.int_in_range(0..=20)?);
259
260    let mut builder = ListViewBuilder::<O, O>::with_capacity(Arc::clone(elem_dtype), null, 20, 10);
261
262    for _ in 0..array_length {
263        if null == Nullability::Nullable && u.arbitrary::<bool>()? {
264            builder.append_null();
265        } else {
266            let list_size = u.int_in_range(0..=20)?;
267            builder
268                .append_value(random_list_scalar(u, elem_dtype, list_size, null)?.as_list())
269                .vortex_expect("can append value");
270        }
271    }
272
273    Ok(builder.finish())
274}
275
276/// Creates a random list scalar with the specified list size.
277fn random_list_scalar(
278    u: &mut Unstructured,
279    elem_dtype: &Arc<DType>,
280    list_size: u32,
281    null: Nullability,
282) -> Result<Scalar> {
283    let elems = (0..list_size)
284        .map(|_| random_scalar(u, elem_dtype))
285        .collect::<Result<Vec<_>>>()?;
286    Ok(Scalar::list(Arc::clone(elem_dtype), elems, null))
287}
288
289fn random_string(
290    u: &mut Unstructured,
291    nullability: Nullability,
292    len: Option<usize>,
293) -> Result<ArrayRef> {
294    match nullability {
295        Nullability::NonNullable => {
296            let v = arbitrary_vec_of_len::<String>(u, len)?;
297            Ok(match u.int_in_range(0..=1)? {
298                0 => VarBinArray::from_vec(v, DType::Utf8(Nullability::NonNullable)).into_array(),
299                1 => VarBinViewArray::from_iter_str(v).into_array(),
300                _ => unreachable!(),
301            })
302        }
303        Nullability::Nullable => {
304            let v = arbitrary_vec_of_len::<Option<String>>(u, len)?;
305            Ok(match u.int_in_range(0..=1)? {
306                0 => VarBinArray::from_iter(v, DType::Utf8(Nullability::Nullable)).into_array(),
307                1 => VarBinViewArray::from_iter_nullable_str(v).into_array(),
308                _ => unreachable!(),
309            })
310        }
311    }
312}
313
314fn random_bytes(
315    u: &mut Unstructured,
316    nullability: Nullability,
317    len: Option<usize>,
318) -> Result<ArrayRef> {
319    match nullability {
320        Nullability::NonNullable => {
321            let v = arbitrary_vec_of_len::<Vec<u8>>(u, len)?;
322            Ok(match u.int_in_range(0..=1)? {
323                0 => VarBinArray::from_vec(v, DType::Binary(Nullability::NonNullable)).into_array(),
324                1 => VarBinViewArray::from_iter_bin(v).into_array(),
325                _ => unreachable!(),
326            })
327        }
328        Nullability::Nullable => {
329            let v = arbitrary_vec_of_len::<Option<Vec<u8>>>(u, len)?;
330            Ok(match u.int_in_range(0..=1)? {
331                0 => VarBinArray::from_iter(v, DType::Binary(Nullability::Nullable)).into_array(),
332                1 => VarBinViewArray::from_iter_nullable_bin(v).into_array(),
333                _ => unreachable!(),
334            })
335        }
336    }
337}
338
339fn random_primitive<'a, T: Arbitrary<'a> + NativePType>(
340    u: &mut Unstructured<'a>,
341    nullability: Nullability,
342    len: Option<usize>,
343) -> Result<ArrayRef> {
344    let v = arbitrary_vec_of_len::<T>(u, len)?;
345    let validity = random_validity(u, nullability, v.len())?;
346    Ok(PrimitiveArray::new(Buffer::copy_from(v), validity).into_array())
347}
348
349fn random_bool(
350    u: &mut Unstructured,
351    nullability: Nullability,
352    len: Option<usize>,
353) -> Result<ArrayRef> {
354    let v = arbitrary_vec_of_len(u, len)?;
355    let validity = random_validity(u, nullability, v.len())?;
356    Ok(BoolArray::new(BitBuffer::from(v), validity).into_array())
357}
358
359pub fn random_validity(
360    u: &mut Unstructured,
361    nullability: Nullability,
362    len: usize,
363) -> Result<Validity> {
364    match nullability {
365        Nullability::NonNullable => Ok(Validity::NonNullable),
366        Nullability::Nullable => Ok(match u.int_in_range(0..=2)? {
367            0 => Validity::AllValid,
368            1 => Validity::AllInvalid,
369            2 => Validity::from_iter(arbitrary_vec_of_len::<bool>(u, Some(len))?),
370            _ => unreachable!(),
371        }),
372    }
373}
374
375fn arbitrary_vec_of_len<'a, T: Arbitrary<'a>>(
376    u: &mut Unstructured<'a>,
377    len: Option<usize>,
378) -> Result<Vec<T>> {
379    len.map(|l| (0..l).map(|_| T::arbitrary(u)).collect::<Result<Vec<_>>>())
380        .unwrap_or_else(|| Vec::<T>::arbitrary(u))
381}