lance_datagen/
generator.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc};
5
6use arrow::{
7    array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder},
8    buffer::{BooleanBuffer, Buffer, OffsetBuffer, ScalarBuffer},
9    datatypes::{
10        ArrowPrimitiveType, Float32Type, Int32Type, Int64Type, IntervalDayTime,
11        IntervalMonthDayNano, UInt32Type,
12    },
13};
14use arrow_array::{
15    make_array,
16    types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
17    Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray,
18    LargeStringArray, ListArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
19    RecordBatchOptions, RecordBatchReader, StringArray, StructArray,
20};
21use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
22use futures::{stream::BoxStream, StreamExt};
23use rand::{distr::Uniform, Rng, RngCore, SeedableRng};
24use random_word;
25
26use self::array::rand_with_distribution;
27
/// A count of rows (array elements)
#[derive(Copy, Clone, Debug, Default)]
pub struct RowCount(u64);

impl From<u64> for RowCount {
    fn from(rows: u64) -> Self {
        RowCount(rows)
    }
}

/// A count of batches
#[derive(Copy, Clone, Debug, Default)]
pub struct BatchCount(u32);

impl From<u32> for BatchCount {
    fn from(batches: u32) -> Self {
        BatchCount(batches)
    }
}

/// A size expressed in bytes
#[derive(Copy, Clone, Debug, Default)]
pub struct ByteCount(u64);

impl From<u64> for ByteCount {
    fn from(bytes: u64) -> Self {
        ByteCount(bytes)
    }
}

/// The number of items in a fixed-size list / the bound of a list size
#[derive(Copy, Clone, Debug, Default)]
pub struct Dimension(u32);

impl From<u32> for Dimension {
    fn from(size: u32) -> Self {
        Dimension(size)
    }
}
60
61/// A trait for anything that can generate arrays of data
62pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
63    /// Generate an array of the given length
64    ///
65    /// # Arguments
66    ///
67    /// * `length` - The number of elements to generate
68    /// * `rng` - The random number generator to use
69    ///
70    /// # Returns
71    ///
72    /// An array of the given length
73    ///
74    /// Note: Not every generator needs an rng.  However, it is passed here because many do and this
75    /// lets us manage RNGs at the batch level instead of the array level.
76    fn generate(
77        &mut self,
78        length: RowCount,
79        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
80    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError>;
81
82    /// Generate an array of the given length using a new RNG with the default seed
83    ///
84    /// # Arguments
85    ///
86    /// * `length` - The number of elements to generate
87    ///
88    /// # Returns
89    ///
90    /// An array of the given length
91    fn generate_default(
92        &mut self,
93        length: RowCount,
94    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
95        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
96        Self::generate(self, length, &mut rng)
97    }
98    /// Get the data type of the array that this generator produces
99    ///
100    /// # Returns
101    ///
102    /// The data type of the array that this generator produces
103    fn data_type(&self) -> &DataType;
104    /// Gets metadata that should be associated with the field generated by this generator
105    fn metadata(&self) -> Option<HashMap<String, String>> {
106        None
107    }
108    /// Get the size of each element in bytes
109    ///
110    /// # Returns
111    ///
112    /// The size of each element in bytes.  Will be None if the size varies by element.
113    fn element_size_bytes(&self) -> Option<ByteCount>;
114}
115
/// Wraps another generator and replaces the validity bitmap of every array it produces
/// with a repeating boolean pattern
#[derive(Debug)]
pub struct CycleNullGenerator {
    // The wrapped generator that supplies the values
    generator: Box<dyn ArrayGenerator>,
    // The validity pattern that is cycled over the generated rows
    validity: Vec<bool>,
    // Position in `validity` at which the next generated batch starts
    idx: usize,
}
122#[derive(Debug)]
123pub struct CycleNanGenerator {
124    generator: Box<dyn ArrayGenerator>,
125    nan_pattern: Vec<bool>,
126    idx: usize,
127}
128
129impl ArrayGenerator for CycleNanGenerator {
130    fn generate(
131        &mut self,
132        length: RowCount,
133        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
134    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
135        let array = self.generator.generate(length, rng)?;
136
137        // Only apply NaN pattern to float types
138        match array.data_type() {
139            DataType::Float16 => {
140                let float_array = array
141                    .as_any()
142                    .downcast_ref::<arrow_array::Float16Array>()
143                    .unwrap();
144                let mut values: Vec<half::f16> = float_array.values().to_vec();
145
146                for (i, &should_be_nan) in self
147                    .nan_pattern
148                    .iter()
149                    .cycle()
150                    .skip(self.idx)
151                    .take(length.0 as usize)
152                    .enumerate()
153                {
154                    if should_be_nan {
155                        values[i] = half::f16::NAN;
156                    }
157                }
158
159                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
160                Ok(Arc::new(arrow_array::Float16Array::from(values)))
161            }
162            DataType::Float32 => {
163                let float_array = array
164                    .as_any()
165                    .downcast_ref::<arrow_array::Float32Array>()
166                    .unwrap();
167                let mut values: Vec<f32> = float_array.values().to_vec();
168
169                for (i, &should_be_nan) in self
170                    .nan_pattern
171                    .iter()
172                    .cycle()
173                    .skip(self.idx)
174                    .take(length.0 as usize)
175                    .enumerate()
176                {
177                    if should_be_nan {
178                        values[i] = f32::NAN;
179                    }
180                }
181
182                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
183                Ok(Arc::new(arrow_array::Float32Array::from(values)))
184            }
185            DataType::Float64 => {
186                let float_array = array
187                    .as_any()
188                    .downcast_ref::<arrow_array::Float64Array>()
189                    .unwrap();
190                let mut values: Vec<f64> = float_array.values().to_vec();
191
192                for (i, &should_be_nan) in self
193                    .nan_pattern
194                    .iter()
195                    .cycle()
196                    .skip(self.idx)
197                    .take(length.0 as usize)
198                    .enumerate()
199                {
200                    if should_be_nan {
201                        values[i] = f64::NAN;
202                    }
203                }
204
205                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
206                Ok(Arc::new(arrow_array::Float64Array::from(values)))
207            }
208            _ => {
209                // For non-float types, just return the original array unchanged
210                Ok(array)
211            }
212        }
213    }
214
215    fn data_type(&self) -> &DataType {
216        self.generator.data_type()
217    }
218
219    fn element_size_bytes(&self) -> Option<ByteCount> {
220        self.generator.element_size_bytes()
221    }
222}
223
224impl ArrayGenerator for CycleNullGenerator {
225    fn generate(
226        &mut self,
227        length: RowCount,
228        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
229    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
230        let array = self.generator.generate(length, rng)?;
231        let data = array.to_data();
232        let validity_itr = self
233            .validity
234            .iter()
235            .cycle()
236            .skip(self.idx)
237            .take(length.0 as usize)
238            .copied();
239        let validity_bitmap = BooleanBuffer::from_iter(validity_itr);
240
241        self.idx = (self.idx + (length.0 as usize)) % self.validity.len();
242        unsafe {
243            let new_data = ArrayData::new_unchecked(
244                data.data_type().clone(),
245                data.len(),
246                None,
247                Some(validity_bitmap.into_inner()),
248                data.offset(),
249                data.buffers().to_vec(),
250                data.child_data().into(),
251            );
252            Ok(make_array(new_data))
253        }
254    }
255
256    fn data_type(&self) -> &DataType {
257        self.generator.data_type()
258    }
259
260    fn element_size_bytes(&self) -> Option<ByteCount> {
261        self.generator.element_size_bytes()
262    }
263}
264
265#[derive(Debug)]
266pub struct MetadataGenerator {
267    generator: Box<dyn ArrayGenerator>,
268    metadata: HashMap<String, String>,
269}
270
271impl ArrayGenerator for MetadataGenerator {
272    fn generate(
273        &mut self,
274        length: RowCount,
275        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
276    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
277        self.generator.generate(length, rng)
278    }
279
280    fn metadata(&self) -> Option<HashMap<String, String>> {
281        Some(self.metadata.clone())
282    }
283
284    fn data_type(&self) -> &DataType {
285        self.generator.data_type()
286    }
287
288    fn element_size_bytes(&self) -> Option<ByteCount> {
289        self.generator.element_size_bytes()
290    }
291}
292
293#[derive(Debug)]
294pub struct NullGenerator {
295    generator: Box<dyn ArrayGenerator>,
296    null_probability: f64,
297}
298
299impl ArrayGenerator for NullGenerator {
300    fn generate(
301        &mut self,
302        length: RowCount,
303        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
304    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
305        let array = self.generator.generate(length, rng)?;
306        let data = array.to_data();
307
308        if self.null_probability < 0.0 || self.null_probability > 1.0 {
309            return Err(ArrowError::InvalidArgumentError(format!(
310                "null_probability must be between 0 and 1, got {}",
311                self.null_probability
312            )));
313        }
314
315        let (null_count, new_validity) = if self.null_probability == 0.0 {
316            if data.null_count() == 0 {
317                return Ok(array);
318            } else {
319                (0_usize, None)
320            }
321        } else if self.null_probability == 1.0 {
322            if data.null_count() == data.len() {
323                return Ok(array);
324            } else {
325                let all_nulls = BooleanBuffer::new_unset(array.len());
326                (array.len(), Some(all_nulls.into_inner()))
327            }
328        } else {
329            let array_len = array.len();
330            let num_validity_bytes = array_len.div_ceil(8);
331            let mut null_count = 0;
332            // Sampling the RNG once per bit is kind of slow so we do this to sample once
333            // per byte.  We only get 8 bits of RNG resolution but that should be good enough.
334            let threshold = (self.null_probability * u8::MAX as f64) as u8;
335            let bytes = (0..num_validity_bytes)
336                .map(|byte_idx| {
337                    let mut sample = rng.random::<u64>();
338                    let mut byte: u8 = 0;
339                    for bit_idx in 0..8 {
340                        // We could probably overshoot and fill in extra bits with random data but
341                        // this is cleaner and that would mess up the null count
342                        byte <<= 1;
343                        let pos = byte_idx * 8 + (7 - bit_idx);
344                        if pos < array_len {
345                            let sample_piece = sample & 0xFF;
346                            let is_null = (sample_piece as u8) < threshold;
347                            byte |= (!is_null) as u8;
348                            null_count += is_null as usize;
349                        }
350                        sample >>= 8;
351                    }
352                    byte
353                })
354                .collect::<Vec<_>>();
355            let new_validity = Buffer::from_iter(bytes);
356            (null_count, Some(new_validity))
357        };
358
359        unsafe {
360            let new_data = ArrayData::new_unchecked(
361                data.data_type().clone(),
362                data.len(),
363                Some(null_count),
364                new_validity,
365                data.offset(),
366                data.buffers().to_vec(),
367                data.child_data().into(),
368            );
369            Ok(make_array(new_data))
370        }
371    }
372
373    fn metadata(&self) -> Option<HashMap<String, String>> {
374        self.generator.metadata()
375    }
376
377    fn data_type(&self) -> &DataType {
378        self.generator.data_type()
379    }
380
381    fn element_size_bytes(&self) -> Option<ByteCount> {
382        self.generator.element_size_bytes()
383    }
384}
385
386pub trait ArrayGeneratorExt {
387    /// Replaces the validity bitmap of generated arrays, inserting nulls with a given probability
388    fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator>;
389    /// Replaces the validity bitmap of generated arrays with the inverse of `nulls`, cycling if needed
390    fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
391    /// Replaces the values of generated arrays with NaN values, cycling if needed
392    ///
393    /// Will have no effect if the data type is not a floating point data type
394    fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator>;
395    /// Replaces the validity bitmap of generated arrays with `validity`, cycling if needed
396    fn with_validity(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
397    fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator>;
398}
399
400impl ArrayGeneratorExt for Box<dyn ArrayGenerator> {
401    fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator> {
402        Box::new(NullGenerator {
403            generator: self,
404            null_probability,
405        })
406    }
407
408    fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator> {
409        Box::new(CycleNullGenerator {
410            generator: self,
411            validity: nulls.iter().map(|v| !*v).collect(),
412            idx: 0,
413        })
414    }
415
416    fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator> {
417        Box::new(CycleNanGenerator {
418            generator: self,
419            nan_pattern: nans.to_vec(),
420            idx: 0,
421        })
422    }
423
424    fn with_validity(self, validity: &[bool]) -> Box<dyn ArrayGenerator> {
425        Box::new(CycleNullGenerator {
426            generator: self,
427            validity: validity.to_vec(),
428            idx: 0,
429        })
430    }
431
432    fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator> {
433        Box::new(MetadataGenerator {
434            generator: self,
435            metadata,
436        })
437    }
438}
439
/// An iterator adapter that yields every item of the wrapped iterator `n` times in a row
///
/// The adapter can be started in the middle of a group: `cur` is then the item of the
/// group in progress and `count` is the number of times it still has to be yielded
/// before a new item is pulled from the wrapped iterator.
///
/// An `n` of 0 is treated like an `n` of 1 (every item is yielded once).
pub struct NTimesIter<I: Iterator>
where
    I::Item: Copy,
{
    // The wrapped iterator
    iter: I,
    // How often each item is yielded
    n: u32,
    // The item that is currently being repeated
    cur: I::Item,
    // How many more times `cur` must be yielded
    count: u32,
}

// Note: if this is used then there is a performance hit as the
// inner loop cannot experience vectorization
//
// TODO: maybe faster to build the vec and then repeat it into
// the destination array?
impl<I: Iterator> Iterator for NTimesIter<I>
where
    I::Item: Copy,
{
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        if self.count == 0 {
            // Pull the next item before touching `count`.  If the wrapped iterator is
            // exhausted we must leave our state alone, otherwise later calls would
            // yield the stale `cur` again.
            self.cur = self.iter.next()?;
            self.count = self.n.saturating_sub(1);
        } else {
            self.count -= 1;
        }
        Some(self.cur)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let repeats = self.n.max(1) as usize;
        // Repeats of `cur` that are still owed on top of what the wrapped iterator holds
        let pending = self.count as usize;
        let (lower, upper) = self.iter.size_hint();
        let lower = lower.saturating_mul(repeats).saturating_add(pending);
        // An upper bound that does not fit into a usize is reported as unknown
        let upper = upper
            .and_then(|u| u.checked_mul(repeats))
            .and_then(|u| u.checked_add(pending));
        (lower, upper)
    }
}
478
479pub struct FnGen<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T>
480where
481    T: Copy + Default,
482    ArrayType: arrow_array::Array + From<Vec<T>>,
483{
484    data_type: DataType,
485    generator: F,
486    array_type: PhantomData<ArrayType>,
487    repeat: u32,
488    leftover: T,
489    leftover_count: u32,
490    element_size_bytes: Option<ByteCount>,
491}
492
493impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> std::fmt::Debug
494    for FnGen<T, ArrayType, F>
495where
496    T: Copy + Default,
497    ArrayType: arrow_array::Array + From<Vec<T>>,
498{
499    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
500        f.debug_struct("FnGen")
501            .field("data_type", &self.data_type)
502            .field("array_type", &self.array_type)
503            .field("repeat", &self.repeat)
504            .field("leftover_count", &self.leftover_count)
505            .field("element_size_bytes", &self.element_size_bytes)
506            .finish()
507    }
508}
509
510impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
511where
512    T: Copy + Default,
513    ArrayType: arrow_array::Array + From<Vec<T>>,
514{
515    fn new_known_size(
516        data_type: DataType,
517        generator: F,
518        repeat: u32,
519        element_size_bytes: ByteCount,
520    ) -> Self {
521        Self {
522            data_type,
523            generator,
524            array_type: PhantomData,
525            repeat,
526            leftover: T::default(),
527            leftover_count: 0,
528            element_size_bytes: Some(element_size_bytes),
529        }
530    }
531
532    fn new_unknown_size(data_type: DataType, generator: F, repeat: u32) -> Self {
533        Self {
534            data_type,
535            generator,
536            array_type: PhantomData,
537            repeat,
538            leftover: T::default(),
539            leftover_count: 0,
540            element_size_bytes: None,
541        }
542    }
543}
544
545impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> ArrayGenerator
546    for FnGen<T, ArrayType, F>
547where
548    T: Copy + Default + Send + Sync,
549    ArrayType: arrow_array::Array + From<Vec<T>> + 'static,
550    F: Send + Sync,
551{
552    fn generate(
553        &mut self,
554        length: RowCount,
555        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
556    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
557        let iter = (0..length.0).map(|_| (self.generator)(rng));
558        let values = if self.repeat > 1 {
559            Vec::from_iter(
560                NTimesIter {
561                    iter,
562                    n: self.repeat,
563                    cur: self.leftover,
564                    count: self.leftover_count,
565                }
566                .take(length.0 as usize),
567            )
568        } else {
569            Vec::from_iter(iter)
570        };
571        self.leftover_count = ((self.leftover_count as u64 + length.0) % self.repeat as u64) as u32;
572        self.leftover = values.last().copied().unwrap_or(T::default());
573        Ok(Arc::new(ArrayType::from(values)))
574    }
575
576    fn data_type(&self) -> &DataType {
577        &self.data_type
578    }
579
580    fn element_size_bytes(&self) -> Option<ByteCount> {
581        self.element_size_bytes
582    }
583}
584
/// A seed for the random number generator
#[derive(Copy, Clone, Debug)]
pub struct Seed(pub u64);

/// The seed used by `ArrayGenerator::generate_default`
pub const DEFAULT_SEED: Seed = Seed(42);

impl From<u64> for Seed {
    fn from(value: u64) -> Self {
        Seed(value)
    }
}
594
595#[derive(Debug)]
596pub struct CycleVectorGenerator {
597    underlying_gen: Box<dyn ArrayGenerator>,
598    dimension: Dimension,
599    data_type: DataType,
600}
601
602impl CycleVectorGenerator {
603    pub fn new(underlying_gen: Box<dyn ArrayGenerator>, dimension: Dimension) -> Self {
604        let data_type = DataType::FixedSizeList(
605            Arc::new(Field::new("item", underlying_gen.data_type().clone(), true)),
606            dimension.0 as i32,
607        );
608        Self {
609            underlying_gen,
610            dimension,
611            data_type,
612        }
613    }
614}
615
616impl ArrayGenerator for CycleVectorGenerator {
617    fn generate(
618        &mut self,
619        length: RowCount,
620        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
621    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
622        let values = self
623            .underlying_gen
624            .generate(RowCount::from(length.0 * self.dimension.0 as u64), rng)?;
625        let field = Arc::new(Field::new("item", values.data_type().clone(), true));
626        let values = Arc::new(values);
627
628        let array = FixedSizeListArray::try_new(field, self.dimension.0 as i32, values, None)?;
629
630        Ok(Arc::new(array))
631    }
632
633    fn data_type(&self) -> &DataType {
634        &self.data_type
635    }
636
637    fn element_size_bytes(&self) -> Option<ByteCount> {
638        self.underlying_gen
639            .element_size_bytes()
640            .map(|byte_count| ByteCount::from(byte_count.0 * self.dimension.0 as u64))
641    }
642}
643
644#[derive(Debug)]
645pub struct CycleListGenerator {
646    underlying_gen: Box<dyn ArrayGenerator>,
647    lengths_gen: Box<dyn ArrayGenerator>,
648    data_type: DataType,
649}
650
651impl CycleListGenerator {
652    pub fn new(
653        underlying_gen: Box<dyn ArrayGenerator>,
654        min_list_size: Dimension,
655        max_list_size: Dimension,
656    ) -> Self {
657        let data_type = DataType::List(Arc::new(Field::new(
658            "item",
659            underlying_gen.data_type().clone(),
660            true,
661        )));
662        let lengths_dist = Uniform::new(min_list_size.0, max_list_size.0).unwrap();
663        let lengths_gen = rand_with_distribution::<UInt32Type, Uniform<u32>>(lengths_dist);
664        Self {
665            underlying_gen,
666            lengths_gen,
667            data_type,
668        }
669    }
670}
671
672impl ArrayGenerator for CycleListGenerator {
673    fn generate(
674        &mut self,
675        length: RowCount,
676        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
677    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
678        let lengths = self.lengths_gen.generate(length, rng)?;
679        let lengths = lengths.as_primitive::<UInt32Type>();
680        let total_length = lengths.values().iter().map(|i| *i as u64).sum::<u64>();
681        let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
682        let values = self
683            .underlying_gen
684            .generate(RowCount::from(total_length), rng)?;
685        let field = Arc::new(Field::new("item", values.data_type().clone(), true));
686        let values = Arc::new(values);
687
688        let array = ListArray::try_new(field, offsets, values, None)?;
689
690        Ok(Arc::new(array))
691    }
692
693    fn data_type(&self) -> &DataType {
694        &self.data_type
695    }
696
697    fn element_size_bytes(&self) -> Option<ByteCount> {
698        None
699    }
700}
701
702#[derive(Debug, Default)]
703pub struct PseudoUuidGenerator {}
704
705impl ArrayGenerator for PseudoUuidGenerator {
706    fn generate(
707        &mut self,
708        length: RowCount,
709        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
710    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
711        Ok(Arc::new(FixedSizeBinaryArray::try_from_iter(
712            (0..length.0).map(|_| {
713                let mut data = vec![0; 16];
714                rng.fill_bytes(&mut data);
715                data
716            }),
717        )?))
718    }
719
720    fn data_type(&self) -> &DataType {
721        &DataType::FixedSizeBinary(16)
722    }
723
724    fn element_size_bytes(&self) -> Option<ByteCount> {
725        Some(ByteCount::from(16))
726    }
727}
728
729#[derive(Debug, Default)]
730pub struct PseudoUuidHexGenerator {}
731
732impl ArrayGenerator for PseudoUuidHexGenerator {
733    fn generate(
734        &mut self,
735        length: RowCount,
736        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
737    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
738        let mut data = vec![0; 16 * length.0 as usize];
739        rng.fill_bytes(&mut data);
740        let data_hex = hex::encode(data);
741
742        Ok(Arc::new(StringArray::from_iter_values(
743            (0..length.0 as usize).map(|i| data_hex.get(i * 32..(i + 1) * 32).unwrap()),
744        )))
745    }
746
747    fn data_type(&self) -> &DataType {
748        &DataType::Utf8
749    }
750
751    fn element_size_bytes(&self) -> Option<ByteCount> {
752        Some(ByteCount::from(16))
753    }
754}
755
756#[derive(Debug, Default)]
757pub struct RandomBooleanGenerator {}
758
759impl ArrayGenerator for RandomBooleanGenerator {
760    fn generate(
761        &mut self,
762        length: RowCount,
763        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
764    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
765        let num_bytes = length.0.div_ceil(8);
766        let mut bytes = vec![0; num_bytes as usize];
767        rng.fill_bytes(&mut bytes);
768        let bytes = BooleanBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
769        Ok(Arc::new(arrow_array::BooleanArray::new(bytes, None)))
770    }
771
772    fn data_type(&self) -> &DataType {
773        &DataType::Boolean
774    }
775
776    fn element_size_bytes(&self) -> Option<ByteCount> {
777        // We can't say 1/8th of a byte and 1 byte would be a pretty extreme over-count so let's leave
778        // it at None until someone needs this.  Then we can probably special case this (e.g. make a ByteCount::ONE_BIT)
779        None
780    }
781}
782
783// Instead of using the "standard distribution" and generating values there are some cases (e.g. f16 / decimal)
784// where we just generate random bytes because there is no rand support
785pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
786    phantom: PhantomData<T>,
787    data_type: DataType,
788}
789
790impl<T: ArrowPrimitiveType + Send + Sync> std::fmt::Debug for RandomBytesGenerator<T> {
791    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
792        f.debug_struct("RandomBytesGenerator")
793            .field("data_type", &self.data_type)
794            .finish()
795    }
796}
797
798impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
799    fn new(data_type: DataType) -> Self {
800        Self {
801            phantom: Default::default(),
802            data_type,
803        }
804    }
805
806    fn byte_width() -> Result<u64, ArrowError> {
807        T::DATA_TYPE.primitive_width().ok_or_else(|| ArrowError::InvalidArgumentError(format!("Cannot generate the data type {} with the RandomBytesGenerator because it is not a fixed-width bytes type", T::DATA_TYPE))).map(|val| val as u64)
808    }
809}
810
811impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerator<T> {
812    fn generate(
813        &mut self,
814        length: RowCount,
815        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
816    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
817        let num_bytes = length.0 * Self::byte_width()?;
818        let mut bytes = vec![0; num_bytes as usize];
819        rng.fill_bytes(&mut bytes);
820        let bytes = ScalarBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
821        Ok(Arc::new(
822            PrimitiveArray::<T>::new(bytes, None).with_data_type(self.data_type.clone()),
823        ))
824    }
825
826    fn data_type(&self) -> &DataType {
827        &self.data_type
828    }
829
830    fn element_size_bytes(&self) -> Option<ByteCount> {
831        Self::byte_width().map(ByteCount::from).ok()
832    }
833}
834
835// This is pretty much the same thing as RandomBinaryGenerator but we can't use that
836// because there is no ArrowPrimitiveType for FixedSizeBinary
837#[derive(Debug)]
838pub struct RandomFixedSizeBinaryGenerator {
839    data_type: DataType,
840    size: i32,
841}
842
843impl RandomFixedSizeBinaryGenerator {
844    fn new(size: i32) -> Self {
845        Self {
846            size,
847            data_type: DataType::FixedSizeBinary(size),
848        }
849    }
850}
851
852impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
853    fn generate(
854        &mut self,
855        length: RowCount,
856        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
857    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
858        let num_bytes = length.0 * self.size as u64;
859        let mut bytes = vec![0; num_bytes as usize];
860        rng.fill_bytes(&mut bytes);
861        Ok(Arc::new(FixedSizeBinaryArray::new(
862            self.size,
863            Buffer::from(bytes),
864            None,
865        )))
866    }
867
868    fn data_type(&self) -> &DataType {
869        &self.data_type
870    }
871
872    fn element_size_bytes(&self) -> Option<ByteCount> {
873        Some(ByteCount::from(self.size as u64))
874    }
875}
876
877#[derive(Debug)]
878pub struct RandomIntervalGenerator {
879    unit: IntervalUnit,
880    data_type: DataType,
881}
882
883impl RandomIntervalGenerator {
884    pub fn new(unit: IntervalUnit) -> Self {
885        Self {
886            unit,
887            data_type: DataType::Interval(unit),
888        }
889    }
890}
891
892impl ArrayGenerator for RandomIntervalGenerator {
893    fn generate(
894        &mut self,
895        length: RowCount,
896        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
897    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
898        match self.unit {
899            IntervalUnit::YearMonth => {
900                let months = (0..length.0)
901                    .map(|_| rng.random::<i32>())
902                    .collect::<Vec<_>>();
903                Ok(Arc::new(arrow_array::IntervalYearMonthArray::from(months)))
904            }
905            IntervalUnit::MonthDayNano => {
906                let day_time_array = (0..length.0)
907                    .map(|_| IntervalMonthDayNano::new(rng.random(), rng.random(), rng.random()))
908                    .collect::<Vec<_>>();
909                Ok(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(
910                    day_time_array,
911                )))
912            }
913            IntervalUnit::DayTime => {
914                let day_time_array = (0..length.0)
915                    .map(|_| IntervalDayTime::new(rng.random(), rng.random()))
916                    .collect::<Vec<_>>();
917                Ok(Arc::new(arrow_array::IntervalDayTimeArray::from(
918                    day_time_array,
919                )))
920            }
921        }
922    }
923
924    fn data_type(&self) -> &DataType {
925        &self.data_type
926    }
927
928    fn element_size_bytes(&self) -> Option<ByteCount> {
929        Some(ByteCount::from(12))
930    }
931}
932#[derive(Debug)]
933pub struct RandomBinaryGenerator {
934    bytes_per_element: ByteCount,
935    scale_to_utf8: bool,
936    is_large: bool,
937    data_type: DataType,
938}
939
940impl RandomBinaryGenerator {
941    pub fn new(bytes_per_element: ByteCount, scale_to_utf8: bool, is_large: bool) -> Self {
942        Self {
943            bytes_per_element,
944            scale_to_utf8,
945            is_large,
946            data_type: match (scale_to_utf8, is_large) {
947                (false, false) => DataType::Binary,
948                (false, true) => DataType::LargeBinary,
949                (true, false) => DataType::Utf8,
950                (true, true) => DataType::LargeUtf8,
951            },
952        }
953    }
954}
955
956impl ArrayGenerator for RandomBinaryGenerator {
957    fn generate(
958        &mut self,
959        length: RowCount,
960        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
961    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
962        let mut bytes = vec![0; (self.bytes_per_element.0 * length.0) as usize];
963        rng.fill_bytes(&mut bytes);
964        if self.scale_to_utf8 {
965            // This doesn't give us the full UTF-8 range and it isn't statistically correct but
966            // it's fast and probably good enough for most cases
967            bytes = bytes.into_iter().map(|val| (val % 95) + 32).collect();
968        }
969        let bytes = Buffer::from(bytes);
970        if self.is_large {
971            let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
972                self.bytes_per_element.0 as usize,
973                length.0 as usize,
974            ));
975            if self.scale_to_utf8 {
976                // This is safe because we are only using printable characters
977                unsafe {
978                    Ok(Arc::new(arrow_array::LargeStringArray::new_unchecked(
979                        offsets, bytes, None,
980                    )))
981                }
982            } else {
983                unsafe {
984                    Ok(Arc::new(arrow_array::LargeBinaryArray::new_unchecked(
985                        offsets, bytes, None,
986                    )))
987                }
988            }
989        } else {
990            let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
991                self.bytes_per_element.0 as usize,
992                length.0 as usize,
993            ));
994            if self.scale_to_utf8 {
995                // This is safe because we are only using printable characters
996                unsafe {
997                    Ok(Arc::new(arrow_array::StringArray::new_unchecked(
998                        offsets, bytes, None,
999                    )))
1000                }
1001            } else {
1002                unsafe {
1003                    Ok(Arc::new(arrow_array::BinaryArray::new_unchecked(
1004                        offsets, bytes, None,
1005                    )))
1006                }
1007            }
1008        }
1009    }
1010
1011    fn data_type(&self) -> &DataType {
1012        &self.data_type
1013    }
1014
1015    fn element_size_bytes(&self) -> Option<ByteCount> {
1016        // Not exactly correct since there are N + 1 4-byte offsets and this only counts N
1017        Some(ByteCount::from(
1018            self.bytes_per_element.0 + std::mem::size_of::<i32>() as u64,
1019        ))
1020    }
1021}
1022
1023/// Generate a sequence of strings with a prefix and a counter
1024///
1025/// For example, if the prefix is "user_" the the strings will be "user_0", "user_1", ...
1026#[derive(Debug)]
1027pub struct PrefixPlusCounterGenerator {
1028    prefix: String,
1029    is_large: bool,
1030    data_type: DataType,
1031    current_counter: u64,
1032}
1033
1034impl PrefixPlusCounterGenerator {
1035    pub fn new(prefix: String, is_large: bool) -> Self {
1036        Self {
1037            prefix,
1038            is_large,
1039            data_type: if is_large {
1040                DataType::LargeUtf8
1041            } else {
1042                DataType::Utf8
1043            },
1044            current_counter: 0,
1045        }
1046    }
1047
1048    fn generate_values<T: OffsetSizeTrait>(
1049        &self,
1050        start: u64,
1051        num_values: u64,
1052    ) -> Result<Arc<dyn Array>, ArrowError> {
1053        let max_counter = start + num_values;
1054        let max_digits_per_counter = (max_counter as f64).log10().ceil() as u64;
1055        let max_bytes_per_str = max_digits_per_counter + self.prefix.len() as u64;
1056        let max_bytes = max_bytes_per_str * num_values;
1057        let mut builder =
1058            GenericStringBuilder::<T>::with_capacity(num_values as usize, max_bytes as usize);
1059        let mut word = String::with_capacity(max_bytes_per_str as usize);
1060        word.push_str(&self.prefix);
1061        for i in 0..num_values {
1062            let counter = start + i;
1063            word.truncate(self.prefix.len());
1064            word.push_str(&counter.to_string());
1065            builder.append_value(&word);
1066        }
1067        Ok(Arc::new(builder.finish()))
1068    }
1069}
1070
1071impl ArrayGenerator for PrefixPlusCounterGenerator {
1072    fn generate(
1073        &mut self,
1074        length: RowCount,
1075        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1076    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1077        let start = self.current_counter;
1078        self.current_counter += length.0;
1079        if self.is_large {
1080            self.generate_values::<i64>(start, length.0)
1081        } else {
1082            self.generate_values::<i32>(start, length.0)
1083        }
1084    }
1085
1086    fn data_type(&self) -> &DataType {
1087        &self.data_type
1088    }
1089
1090    fn element_size_bytes(&self) -> Option<ByteCount> {
1091        // It's not consistent
1092        None
1093    }
1094}
1095
1096/// Generate a sequence of binary strings with a prefix and a counter
1097///
1098/// The counter will be encoded (little-endian) as a u8, u16, u32, or u64 and added to the prefix
1099/// As long as more than 256 values are generated then the resulting array will have
1100/// variable width
1101#[derive(Debug)]
1102pub struct BinaryPrefixPlusCounterGenerator {
1103    prefix: Arc<[u8]>,
1104    is_large: bool,
1105    data_type: DataType,
1106    current_counter: u64,
1107}
1108
1109impl BinaryPrefixPlusCounterGenerator {
1110    pub fn new(prefix: Arc<[u8]>, is_large: bool) -> Self {
1111        Self {
1112            prefix,
1113            is_large,
1114            data_type: if is_large {
1115                DataType::LargeBinary
1116            } else {
1117                DataType::Binary
1118            },
1119            current_counter: 0,
1120        }
1121    }
1122
1123    fn generate_values<T: OffsetSizeTrait>(
1124        &self,
1125        start: u64,
1126        num_values: u64,
1127    ) -> Result<Arc<dyn Array>, ArrowError> {
1128        let max_bytes = (self.prefix.len() + std::mem::size_of::<u64>()) * num_values as usize;
1129        let mut builder = GenericBinaryBuilder::<T>::with_capacity(num_values as usize, max_bytes);
1130        let mut word = Vec::with_capacity(self.prefix.len() + std::mem::size_of::<u64>());
1131        word.extend_from_slice(&self.prefix);
1132        for i in 0..num_values {
1133            let counter = start + i;
1134            word.truncate(self.prefix.len());
1135            if counter < u8::MAX as u64 {
1136                word.push(counter as u8);
1137            } else if counter < u16::MAX as u64 {
1138                word.extend_from_slice(&(counter as u16).to_le_bytes());
1139            } else if counter < u32::MAX as u64 {
1140                word.extend_from_slice(&(counter as u32).to_le_bytes());
1141            } else {
1142                word.extend_from_slice(&counter.to_le_bytes());
1143            }
1144            builder.append_value(&word);
1145        }
1146        Ok(Arc::new(builder.finish()))
1147    }
1148}
1149
1150impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
1151    fn generate(
1152        &mut self,
1153        length: RowCount,
1154        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1155    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1156        let start = self.current_counter;
1157        self.current_counter += length.0;
1158        if self.is_large {
1159            self.generate_values::<i64>(start, length.0)
1160        } else {
1161            self.generate_values::<i32>(start, length.0)
1162        }
1163    }
1164
1165    fn data_type(&self) -> &DataType {
1166        &self.data_type
1167    }
1168
1169    fn element_size_bytes(&self) -> Option<ByteCount> {
1170        // It's not consistent
1171        None
1172    }
1173}
1174
1175#[derive(Debug)]
1176struct RandomSentenceGenerator {
1177    min_words: usize,
1178    max_words: usize,
1179    words: &'static [&'static str],
1180    is_large: bool,
1181}
1182
1183impl RandomSentenceGenerator {
1184    pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self {
1185        let words = random_word::all(random_word::Lang::En);
1186        Self {
1187            min_words,
1188            max_words,
1189            words,
1190            is_large,
1191        }
1192    }
1193}
1194
1195impl ArrayGenerator for RandomSentenceGenerator {
1196    fn generate(
1197        &mut self,
1198        length: RowCount,
1199        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1200    ) -> Result<Arc<dyn Array>, ArrowError> {
1201        let mut values = Vec::with_capacity(length.0 as usize);
1202
1203        for _ in 0..length.0 {
1204            let num_words = rng.random_range(self.min_words..=self.max_words);
1205            let sentence: String = (0..num_words)
1206                .map(|_| self.words[rng.random_range(0..self.words.len())])
1207                .collect::<Vec<_>>()
1208                .join(" ");
1209            values.push(sentence);
1210        }
1211
1212        if self.is_large {
1213            Ok(Arc::new(LargeStringArray::from(values)))
1214        } else {
1215            Ok(Arc::new(StringArray::from(values)))
1216        }
1217    }
1218
1219    fn data_type(&self) -> &DataType {
1220        if self.is_large {
1221            &DataType::LargeUtf8
1222        } else {
1223            &DataType::Utf8
1224        }
1225    }
1226
1227    fn element_size_bytes(&self) -> Option<ByteCount> {
1228        // Estimate average word length as 5, plus space
1229        // See https://arxiv.org/pdf/1208.6109
1230        let avg_word_length = 6;
1231        let avg_words = (self.min_words + self.max_words) / 2;
1232        Some(ByteCount::from((avg_word_length * avg_words) as u64))
1233    }
1234}
1235
1236#[derive(Debug)]
1237struct RandomWordGenerator {
1238    words: &'static [&'static str],
1239    is_large: bool,
1240}
1241
1242impl RandomWordGenerator {
1243    pub fn new(is_large: bool) -> Self {
1244        let words = random_word::all(random_word::Lang::En);
1245        Self { words, is_large }
1246    }
1247}
1248
1249impl ArrayGenerator for RandomWordGenerator {
1250    fn generate(
1251        &mut self,
1252        length: RowCount,
1253        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1254    ) -> Result<Arc<dyn Array>, ArrowError> {
1255        let mut values = Vec::with_capacity(length.0 as usize);
1256
1257        for _ in 0..length.0 {
1258            let word = self.words[rng.random_range(0..self.words.len())];
1259            values.push(word.to_string());
1260        }
1261
1262        if self.is_large {
1263            Ok(Arc::new(LargeStringArray::from(values)))
1264        } else {
1265            Ok(Arc::new(StringArray::from(values)))
1266        }
1267    }
1268
1269    fn data_type(&self) -> &DataType {
1270        if self.is_large {
1271            &DataType::LargeUtf8
1272        } else {
1273            &DataType::Utf8
1274        }
1275    }
1276
1277    fn element_size_bytes(&self) -> Option<ByteCount> {
1278        // Average English word length is ~5 characters
1279        Some(ByteCount::from(5))
1280    }
1281}
1282
1283#[derive(Debug)]
1284pub struct VariableRandomBinaryGenerator {
1285    lengths_gen: Box<dyn ArrayGenerator>,
1286    data_type: DataType,
1287}
1288
1289impl VariableRandomBinaryGenerator {
1290    pub fn new(min_bytes_per_element: ByteCount, max_bytes_per_element: ByteCount) -> Self {
1291        let lengths_dist = Uniform::new_inclusive(
1292            min_bytes_per_element.0 as i32,
1293            max_bytes_per_element.0 as i32,
1294        )
1295        .unwrap();
1296        let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1297
1298        Self {
1299            lengths_gen,
1300            data_type: DataType::Binary,
1301        }
1302    }
1303}
1304
1305impl ArrayGenerator for VariableRandomBinaryGenerator {
1306    fn generate(
1307        &mut self,
1308        length: RowCount,
1309        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1310    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1311        let lengths = self.lengths_gen.generate(length, rng)?;
1312        let lengths = lengths.as_primitive::<Int32Type>();
1313        let total_length = lengths.values().iter().map(|i| *i as usize).sum::<usize>();
1314        let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1315        let mut bytes = vec![0; total_length];
1316        rng.fill_bytes(&mut bytes);
1317        let bytes = Buffer::from(bytes);
1318        Ok(Arc::new(BinaryArray::try_new(offsets, bytes, None)?))
1319    }
1320
1321    fn data_type(&self) -> &DataType {
1322        &self.data_type
1323    }
1324
1325    fn element_size_bytes(&self) -> Option<ByteCount> {
1326        None
1327    }
1328}
1329
1330pub struct CycleBinaryGenerator<T: ByteArrayType> {
1331    values: Vec<u8>,
1332    lengths: Vec<usize>,
1333    data_type: DataType,
1334    array_type: PhantomData<T>,
1335    width: Option<ByteCount>,
1336    idx: usize,
1337}
1338
1339impl<T: ByteArrayType> std::fmt::Debug for CycleBinaryGenerator<T> {
1340    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1341        f.debug_struct("CycleBinaryGenerator")
1342            .field("values", &self.values)
1343            .field("lengths", &self.lengths)
1344            .field("data_type", &self.data_type)
1345            .field("width", &self.width)
1346            .field("idx", &self.idx)
1347            .finish()
1348    }
1349}
1350
1351impl<T: ByteArrayType> CycleBinaryGenerator<T> {
1352    pub fn from_strings(values: &[&str]) -> Self {
1353        if values.is_empty() {
1354            panic!("Attempt to create a cycle generator with no values");
1355        }
1356        let lengths = values.iter().map(|s| s.len()).collect::<Vec<_>>();
1357        let typical_length = lengths[0];
1358        let width = if lengths.iter().all(|item| *item == typical_length) {
1359            Some(ByteCount::from(
1360                typical_length as u64 + std::mem::size_of::<i32>() as u64,
1361            ))
1362        } else {
1363            None
1364        };
1365        let values = values
1366            .iter()
1367            .flat_map(|s| s.as_bytes().iter().copied())
1368            .collect::<Vec<_>>();
1369        Self {
1370            values,
1371            lengths,
1372            data_type: T::DATA_TYPE,
1373            array_type: PhantomData,
1374            width,
1375            idx: 0,
1376        }
1377    }
1378}
1379
1380impl<T: ByteArrayType> ArrayGenerator for CycleBinaryGenerator<T> {
1381    fn generate(
1382        &mut self,
1383        length: RowCount,
1384        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1385    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1386        let lengths = self
1387            .lengths
1388            .iter()
1389            .copied()
1390            .cycle()
1391            .skip(self.idx)
1392            .take(length.0 as usize);
1393        let num_bytes = lengths.clone().sum();
1394        let byte_offset = self.lengths[0..self.idx].iter().sum();
1395        let bytes = self
1396            .values
1397            .iter()
1398            .cycle()
1399            .skip(byte_offset)
1400            .copied()
1401            .take(num_bytes)
1402            .collect::<Vec<_>>();
1403        let bytes = Buffer::from(bytes);
1404        let offsets = OffsetBuffer::from_lengths(lengths);
1405        self.idx = (self.idx + length.0 as usize) % self.lengths.len();
1406        Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1407            offsets, bytes, None,
1408        )))
1409    }
1410
1411    fn data_type(&self) -> &DataType {
1412        &self.data_type
1413    }
1414
1415    fn element_size_bytes(&self) -> Option<ByteCount> {
1416        self.width
1417    }
1418}
1419
1420pub struct FixedBinaryGenerator<T: ByteArrayType> {
1421    value: Vec<u8>,
1422    data_type: DataType,
1423    array_type: PhantomData<T>,
1424}
1425
1426impl<T: ByteArrayType> std::fmt::Debug for FixedBinaryGenerator<T> {
1427    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1428        f.debug_struct("FixedBinaryGenerator")
1429            .field("value", &self.value)
1430            .field("data_type", &self.data_type)
1431            .finish()
1432    }
1433}
1434
1435impl<T: ByteArrayType> FixedBinaryGenerator<T> {
1436    pub fn new(value: Vec<u8>) -> Self {
1437        Self {
1438            value,
1439            data_type: T::DATA_TYPE,
1440            array_type: PhantomData,
1441        }
1442    }
1443}
1444
1445impl<T: ByteArrayType> ArrayGenerator for FixedBinaryGenerator<T> {
1446    fn generate(
1447        &mut self,
1448        length: RowCount,
1449        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1450    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1451        let bytes = Buffer::from(Vec::from_iter(
1452            self.value
1453                .iter()
1454                .cycle()
1455                .take((length.0 * self.value.len() as u64) as usize)
1456                .copied(),
1457        ));
1458        let offsets =
1459            OffsetBuffer::from_lengths(iter::repeat_n(self.value.len(), length.0 as usize));
1460        Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1461            offsets, bytes, None,
1462        )))
1463    }
1464
1465    fn data_type(&self) -> &DataType {
1466        &self.data_type
1467    }
1468
1469    fn element_size_bytes(&self) -> Option<ByteCount> {
1470        // Not exactly correct since there are N + 1 4-byte offsets and this only counts N
1471        Some(ByteCount::from(
1472            self.value.len() as u64 + std::mem::size_of::<i32>() as u64,
1473        ))
1474    }
1475}
1476
1477pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
1478    generator: Box<dyn ArrayGenerator>,
1479    data_type: DataType,
1480    key_type: PhantomData<K>,
1481    key_width: u64,
1482}
1483
1484impl<K: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryGenerator<K> {
1485    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1486        f.debug_struct("DictionaryGenerator")
1487            .field("generator", &self.generator)
1488            .field("data_type", &self.data_type)
1489            .field("key_width", &self.key_width)
1490            .finish()
1491    }
1492}
1493
1494impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
1495    fn new(generator: Box<dyn ArrayGenerator>) -> Self {
1496        let key_type = Box::new(K::DATA_TYPE);
1497        let key_width = key_type
1498            .primitive_width()
1499            .expect("dictionary key types should have a known width")
1500            as u64;
1501        let val_type = Box::new(generator.data_type().clone());
1502        let dict_type = DataType::Dictionary(key_type, val_type);
1503        Self {
1504            generator,
1505            data_type: dict_type,
1506            key_type: PhantomData,
1507            key_width,
1508        }
1509    }
1510}
1511
1512impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGenerator<K> {
1513    fn generate(
1514        &mut self,
1515        length: RowCount,
1516        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1517    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1518        let underlying = self.generator.generate(length, rng)?;
1519        arrow_cast::cast::cast(&underlying, &self.data_type)
1520    }
1521
1522    fn data_type(&self) -> &DataType {
1523        &self.data_type
1524    }
1525
1526    fn element_size_bytes(&self) -> Option<ByteCount> {
1527        self.generator
1528            .element_size_bytes()
1529            .map(|size_bytes| ByteCount::from(size_bytes.0 + self.key_width))
1530    }
1531}
1532
1533#[derive(Debug)]
1534struct RandomListGenerator {
1535    field: Arc<Field>,
1536    child_field: Arc<Field>,
1537    items_gen: Box<dyn ArrayGenerator>,
1538    lengths_gen: Box<dyn ArrayGenerator>,
1539    is_large: bool,
1540}
1541
1542impl RandomListGenerator {
1543    // Creates a list generator that generates random lists with lengths between 0 and 10 (inclusive)
1544    fn new(items_gen: Box<dyn ArrayGenerator>, is_large: bool) -> Self {
1545        let child_field = Arc::new(Field::new("item", items_gen.data_type().clone(), true));
1546        let list_type = if is_large {
1547            DataType::LargeList(child_field.clone())
1548        } else {
1549            DataType::List(child_field.clone())
1550        };
1551        let field = Field::new("", list_type, true);
1552        let lengths_gen = if is_large {
1553            let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1554            rand_with_distribution::<Int64Type, Uniform<i64>>(lengths_dist)
1555        } else {
1556            let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1557            rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist)
1558        };
1559        Self {
1560            field: Arc::new(field),
1561            child_field,
1562            items_gen,
1563            lengths_gen,
1564            is_large,
1565        }
1566    }
1567}
1568
1569impl ArrayGenerator for RandomListGenerator {
1570    fn generate(
1571        &mut self,
1572        length: RowCount,
1573        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1574    ) -> Result<Arc<dyn Array>, ArrowError> {
1575        let lengths = self.lengths_gen.generate(length, rng)?;
1576        if self.is_large {
1577            let lengths = lengths.as_primitive::<Int64Type>();
1578            let total_length = lengths.values().iter().sum::<i64>() as u64;
1579            let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1580            let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1581            Ok(Arc::new(LargeListArray::try_new(
1582                self.child_field.clone(),
1583                offsets,
1584                items,
1585                None,
1586            )?))
1587        } else {
1588            let lengths = lengths.as_primitive::<Int32Type>();
1589            let total_length = lengths.values().iter().sum::<i32>() as u64;
1590            let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1591            let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1592            Ok(Arc::new(ListArray::try_new(
1593                self.child_field.clone(),
1594                offsets,
1595                items,
1596                None,
1597            )?))
1598        }
1599    }
1600
1601    fn data_type(&self) -> &DataType {
1602        self.field.data_type()
1603    }
1604
1605    fn element_size_bytes(&self) -> Option<ByteCount> {
1606        None
1607    }
1608}
1609
1610#[derive(Debug)]
1611struct NullArrayGenerator {}
1612
1613impl ArrayGenerator for NullArrayGenerator {
1614    fn generate(
1615        &mut self,
1616        length: RowCount,
1617        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1618    ) -> Result<Arc<dyn Array>, ArrowError> {
1619        Ok(Arc::new(NullArray::new(length.0 as usize)))
1620    }
1621
1622    fn data_type(&self) -> &DataType {
1623        &DataType::Null
1624    }
1625
1626    fn element_size_bytes(&self) -> Option<ByteCount> {
1627        None
1628    }
1629}
1630
1631/// Generates 2 dimensional vectors along the unit circle, with a configurable number of steps per circle.
1632#[derive(Debug)]
1633struct RadialStepGenerator {
1634    num_steps_per_circle: u32,
1635    data_field: Arc<Field>,
1636    data_type: DataType,
1637    current_step: u32,
1638}
1639
1640impl RadialStepGenerator {
1641    fn new(num_steps_per_circle: u32) -> Self {
1642        let data_field = Arc::new(Field::new("item", DataType::Float32, false));
1643        let data_type = DataType::FixedSizeList(data_field.clone(), 2);
1644        Self {
1645            num_steps_per_circle,
1646            data_field,
1647            data_type,
1648            current_step: 0,
1649        }
1650    }
1651}
1652
1653impl ArrayGenerator for RadialStepGenerator {
1654    fn generate(
1655        &mut self,
1656        length: RowCount,
1657        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1658    ) -> Result<Arc<dyn Array>, ArrowError> {
1659        let mut values_builder = Float32Builder::with_capacity(length.0 as usize * 2);
1660        for _ in 0..length.0 {
1661            let angle = (self.current_step as f32) / (self.num_steps_per_circle as f32)
1662                * 2.0
1663                * std::f32::consts::PI;
1664            values_builder.append_value(angle.cos());
1665            values_builder.append_value(angle.sin());
1666            self.current_step = (self.current_step + 1) % self.num_steps_per_circle;
1667        }
1668        let values = values_builder.finish();
1669        let vectors =
1670            FixedSizeListArray::try_new(self.data_field.clone(), 2, Arc::new(values), None)?;
1671        Ok(Arc::new(vectors))
1672    }
1673
1674    fn data_type(&self) -> &DataType {
1675        &self.data_type
1676    }
1677
1678    fn element_size_bytes(&self) -> Option<ByteCount> {
1679        Some(ByteCount::from(8))
1680    }
1681}
1682
1683/// Cycles through a set of centroids, adding noise to each point
1684#[derive(Debug)]
1685struct JitterCentroidsGenerator {
1686    centroids: Float32Array,
1687    dimension: u32,
1688    noise_level: f32,
1689    data_type: DataType,
1690    data_field: Arc<Field>,
1691
1692    offset: usize,
1693}
1694
1695impl JitterCentroidsGenerator {
1696    fn try_new(centroids: Arc<dyn Array>, noise_level: f32) -> Result<Self, ArrowError> {
1697        let DataType::FixedSizeList(values_field, dimension) = centroids.data_type() else {
1698            return Err(ArrowError::InvalidArgumentError(
1699                "Centroids must be a FixedSizeList".to_string(),
1700            ));
1701        };
1702        if values_field.data_type() != &DataType::Float32 {
1703            return Err(ArrowError::InvalidArgumentError(
1704                "Centroids values must be a Float32".to_string(),
1705            ));
1706        }
1707        let data_type = DataType::FixedSizeList(values_field.clone(), *dimension);
1708        Ok(Self {
1709            centroids: centroids
1710                .as_fixed_size_list()
1711                .values()
1712                .as_primitive::<Float32Type>()
1713                .clone(),
1714            dimension: *dimension as u32,
1715            noise_level,
1716            data_type,
1717            data_field: values_field.clone(),
1718            offset: 0,
1719        })
1720    }
1721}
1722
1723impl ArrayGenerator for JitterCentroidsGenerator {
1724    fn generate(
1725        &mut self,
1726        length: RowCount,
1727        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1728    ) -> Result<Arc<dyn Array>, ArrowError> {
1729        let mut values_builder =
1730            Float32Builder::with_capacity(length.0 as usize * self.dimension as usize);
1731        for _ in 0..length.0 {
1732            // Generate random N dimensional point
1733            let mut noise = (0..self.dimension as usize)
1734                .map(|_| rng.random::<f32>())
1735                .collect::<Vec<_>>();
1736            // Scale point to noise_level length
1737            let scale = self.noise_level / noise.iter().map(|v| v * v).sum::<f32>().sqrt();
1738            noise.iter_mut().for_each(|v| *v *= scale);
1739
1740            // Add noise to centroid and store in values
1741            for (i, noise) in noise.into_iter().enumerate() {
1742                let centroid_val = self.centroids.value(self.offset + i);
1743                let jittered_val = centroid_val + noise;
1744                values_builder.append_value(jittered_val);
1745            }
1746            // Advance to next centroid
1747            self.offset = (self.offset + self.dimension as usize) % self.centroids.len();
1748        }
1749        let values = values_builder.finish();
1750        let vectors = FixedSizeListArray::try_new(
1751            self.data_field.clone(),
1752            self.dimension as i32,
1753            Arc::new(values),
1754            None,
1755        )?;
1756        Ok(Arc::new(vectors))
1757    }
1758
1759    fn data_type(&self) -> &DataType {
1760        &self.data_type
1761    }
1762
1763    fn element_size_bytes(&self) -> Option<ByteCount> {
1764        Some(ByteCount::from(self.dimension as u64 * 4))
1765    }
1766}
1767#[derive(Debug)]
1768struct RandomStructGenerator {
1769    fields: Fields,
1770    data_type: DataType,
1771    child_gens: Vec<Box<dyn ArrayGenerator>>,
1772}
1773
1774impl RandomStructGenerator {
1775    fn new(fields: Fields, child_gens: Vec<Box<dyn ArrayGenerator>>) -> Self {
1776        let data_type = DataType::Struct(fields.clone());
1777        Self {
1778            fields,
1779            data_type,
1780            child_gens,
1781        }
1782    }
1783}
1784
1785impl ArrayGenerator for RandomStructGenerator {
1786    fn generate(
1787        &mut self,
1788        length: RowCount,
1789        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1790    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1791        if self.child_gens.is_empty() {
1792            // Have to create empty struct arrays specially to ensure they have the correct
1793            // row count
1794            let struct_arr = StructArray::new_empty_fields(length.0 as usize, None);
1795            return Ok(Arc::new(struct_arr));
1796        }
1797        let child_arrays = self
1798            .child_gens
1799            .iter_mut()
1800            .map(|genn| genn.generate(length, rng))
1801            .collect::<Result<Vec<_>, ArrowError>>()?;
1802        let struct_arr = StructArray::new(self.fields.clone(), child_arrays, None);
1803        Ok(Arc::new(struct_arr))
1804    }
1805
1806    fn data_type(&self) -> &DataType {
1807        &self.data_type
1808    }
1809
1810    fn element_size_bytes(&self) -> Option<ByteCount> {
1811        let mut sum = 0;
1812        for child_gen in &self.child_gens {
1813            sum += child_gen.element_size_bytes()?.0;
1814        }
1815        Some(ByteCount::from(sum))
1816    }
1817}
1818
1819/// A RecordBatchReader that generates batches of the given size from the given array generators
1820pub struct FixedSizeBatchGenerator {
1821    rng: rand_xoshiro::Xoshiro256PlusPlus,
1822    generators: Vec<Box<dyn ArrayGenerator>>,
1823    batch_size: RowCount,
1824    num_batches: BatchCount,
1825    schema: SchemaRef,
1826}
1827
1828impl FixedSizeBatchGenerator {
1829    fn new(
1830        generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
1831        batch_size: RowCount,
1832        num_batches: BatchCount,
1833        seed: Option<Seed>,
1834        default_null_probability: Option<f64>,
1835    ) -> Self {
1836        let mut fields = Vec::with_capacity(generators.len());
1837        for (field_index, field_gen) in generators.iter().enumerate() {
1838            let (name, genn) = field_gen;
1839            let default_name = format!("field_{}", field_index);
1840            let name = name.clone().unwrap_or(default_name);
1841            let mut field = Field::new(name, genn.data_type().clone(), true);
1842            if let Some(metadata) = genn.metadata() {
1843                field = field.with_metadata(metadata);
1844            }
1845            fields.push(field);
1846        }
1847        let mut generators = generators
1848            .into_iter()
1849            .map(|(_, genn)| genn)
1850            .collect::<Vec<_>>();
1851        if let Some(null_probability) = default_null_probability {
1852            generators = generators
1853                .into_iter()
1854                .map(|genn| genn.with_random_nulls(null_probability))
1855                .collect();
1856        }
1857        let schema = Arc::new(Schema::new(fields));
1858        Self {
1859            rng: rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
1860                seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
1861            ),
1862            generators,
1863            batch_size,
1864            num_batches,
1865            schema,
1866        }
1867    }
1868
1869    fn gen_next(&mut self) -> Result<RecordBatch, ArrowError> {
1870        let mut arrays = Vec::with_capacity(self.generators.len());
1871        for genn in self.generators.iter_mut() {
1872            let arr = genn.generate(self.batch_size, &mut self.rng)?;
1873            arrays.push(arr);
1874        }
1875        self.num_batches.0 -= 1;
1876        Ok(RecordBatch::try_new_with_options(
1877            self.schema.clone(),
1878            arrays,
1879            &RecordBatchOptions::new().with_row_count(Some(self.batch_size.0 as usize)),
1880        )
1881        .unwrap())
1882    }
1883}
1884
1885impl Iterator for FixedSizeBatchGenerator {
1886    type Item = Result<RecordBatch, ArrowError>;
1887
1888    fn next(&mut self) -> Option<Self::Item> {
1889        if self.num_batches.0 == 0 {
1890            return None;
1891        }
1892        Some(self.gen_next())
1893    }
1894}
1895
1896impl RecordBatchReader for FixedSizeBatchGenerator {
1897    fn schema(&self) -> SchemaRef {
1898        self.schema.clone()
1899    }
1900}
1901
/// A builder to create a record batch reader with generated data
///
/// This type is meant to be used in a fluent builder style to define the schema and generators
/// for a record batch reader.
#[derive(Default)]
pub struct BatchGeneratorBuilder {
    // Columns in insertion order; a `None` name marks a column added with `anon_col`
    generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
    // When set, forwarded to the reader as the null probability applied to all columns
    default_null_probability: Option<f64>,
    // Optional seed forwarded to the reader; `None` means the reader picks its default
    seed: Option<Seed>,
}
1912
/// How `BatchGeneratorBuilder::into_reader_bytes` converts a batch size in bytes into a
/// whole number of rows when the byte size is not a multiple of the row size
pub enum RoundingBehavior {
    /// Fail with an error instead of rounding
    ExactOrErr,
    /// Add one extra row, so the batch is larger than requested
    RoundUp,
    /// Drop the partial row, so the batch is smaller than requested
    RoundDown,
}
1918
1919impl BatchGeneratorBuilder {
1920    /// Create a new BatchGeneratorBuilder with a default random seed
1921    pub fn new() -> Self {
1922        Default::default()
1923    }
1924
1925    /// Create a new BatchGeneratorBuilder with the given seed
1926    pub fn new_with_seed(seed: Seed) -> Self {
1927        Self {
1928            seed: Some(seed),
1929            ..Default::default()
1930        }
1931    }
1932
1933    /// Adds a new column to the generator
1934    ///
1935    /// See [`crate::generator::array`] for methods to create generators
1936    pub fn col(mut self, name: impl Into<String>, genn: Box<dyn ArrayGenerator>) -> Self {
1937        self.generators.push((Some(name.into()), genn));
1938        self
1939    }
1940
1941    /// Adds a new column to the generator with a generated unique name
1942    ///
1943    /// See [`crate::generator::array`] for methods to create generators
1944    pub fn anon_col(mut self, genn: Box<dyn ArrayGenerator>) -> Self {
1945        self.generators.push((None, genn));
1946        self
1947    }
1948
1949    pub fn into_batch_rows(self, batch_size: RowCount) -> Result<RecordBatch, ArrowError> {
1950        let mut reader = self.into_reader_rows(batch_size, BatchCount::from(1));
1951        reader
1952            .next()
1953            .expect("Asked for 1 batch but reader was empty")
1954    }
1955
1956    pub fn into_batch_bytes(
1957        self,
1958        batch_size: ByteCount,
1959        rounding: RoundingBehavior,
1960    ) -> Result<RecordBatch, ArrowError> {
1961        let mut reader = self.into_reader_bytes(batch_size, BatchCount::from(1), rounding)?;
1962        reader
1963            .next()
1964            .expect("Asked for 1 batch but reader was empty")
1965    }
1966
1967    /// Create a RecordBatchReader that generates batches of the given size (in rows)
1968    pub fn into_reader_rows(
1969        self,
1970        batch_size: RowCount,
1971        num_batches: BatchCount,
1972    ) -> impl RecordBatchReader {
1973        FixedSizeBatchGenerator::new(
1974            self.generators,
1975            batch_size,
1976            num_batches,
1977            self.seed,
1978            self.default_null_probability,
1979        )
1980    }
1981
1982    pub fn into_reader_stream(
1983        self,
1984        batch_size: RowCount,
1985        num_batches: BatchCount,
1986    ) -> (
1987        BoxStream<'static, Result<RecordBatch, ArrowError>>,
1988        Arc<Schema>,
1989    ) {
1990        // TODO: this is pretty lazy and could be optimized
1991        let reader = self.into_reader_rows(batch_size, num_batches);
1992        let schema = reader.schema();
1993        let batches = reader.collect::<Vec<_>>();
1994        (futures::stream::iter(batches).boxed(), schema)
1995    }
1996
1997    /// Create a RecordBatchReader that generates batches of the given size (in bytes)
1998    pub fn into_reader_bytes(
1999        self,
2000        batch_size_bytes: ByteCount,
2001        num_batches: BatchCount,
2002        rounding: RoundingBehavior,
2003    ) -> Result<impl RecordBatchReader, ArrowError> {
2004        let bytes_per_row = self
2005            .generators
2006            .iter()
2007            .map(|genn| genn.1.element_size_bytes().map(|byte_count| byte_count.0).ok_or(
2008                        ArrowError::NotYetImplemented("The function into_reader_bytes currently requires each array generator to have a fixed element size".to_string())
2009                )
2010            )
2011            .sum::<Result<u64, ArrowError>>()?;
2012        let mut num_rows = RowCount::from(batch_size_bytes.0 / bytes_per_row);
2013        if !batch_size_bytes.0.is_multiple_of(bytes_per_row) {
2014            match rounding {
2015                RoundingBehavior::ExactOrErr => {
2016                    return Err(ArrowError::NotYetImplemented(
2017                        format!("Exact rounding requested but not possible.  Batch size requested {}, row size: {}", batch_size_bytes.0, bytes_per_row))
2018                    );
2019                }
2020                RoundingBehavior::RoundUp => {
2021                    num_rows = RowCount::from(num_rows.0 + 1);
2022                }
2023                RoundingBehavior::RoundDown => (),
2024            }
2025        }
2026        Ok(self.into_reader_rows(num_rows, num_batches))
2027    }
2028
2029    /// Set the seed for the generator
2030    pub fn with_seed(mut self, seed: Seed) -> Self {
2031        self.seed = Some(seed);
2032        self
2033    }
2034
2035    /// Adds nulls (with the given probability) to all columns
2036    pub fn with_random_nulls(&mut self, default_null_probability: f64) {
2037        self.default_null_probability = Some(default_null_probability);
2038    }
2039}
2040
/// Factory for creating a single random array
pub struct ArrayGeneratorBuilder {
    // Generator that produces the array
    generator: Box<dyn ArrayGenerator>,
    // Optional seed for the random number generator; `DEFAULT_SEED` is used when `None`
    seed: Option<Seed>,
}
2046
2047impl ArrayGeneratorBuilder {
2048    fn new(generator: Box<dyn ArrayGenerator>) -> Self {
2049        Self {
2050            generator,
2051            seed: None,
2052        }
2053    }
2054
2055    /// Use the given seed for the generator
2056    pub fn with_seed(mut self, seed: Seed) -> Self {
2057        self.seed = Some(seed);
2058        self
2059    }
2060
2061    /// Generate a single array with the given length
2062    pub fn into_array_rows(
2063        mut self,
2064        length: RowCount,
2065    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2066        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2067            self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2068        );
2069        self.generator.generate(length, &mut rng)
2070    }
2071}
2072
// Milliseconds in one day: 24 hours * 60 minutes * 60 seconds * 1000 milliseconds
const MS_PER_DAY: i64 = 24 * 60 * 60 * 1000;
2074
2075pub mod array {
2076
2077    use arrow::datatypes::{Int16Type, Int64Type, Int8Type};
2078    use arrow_array::types::{
2079        Decimal128Type, Decimal256Type, DurationMicrosecondType, DurationMillisecondType,
2080        DurationNanosecondType, DurationSecondType, Float16Type, Float32Type, Float64Type,
2081        UInt16Type, UInt32Type, UInt64Type, UInt8Type,
2082    };
2083    use arrow_array::{
2084        ArrowNativeTypeOp, BooleanArray, Date32Array, Date64Array, Time32MillisecondArray,
2085        Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
2086        TimestampMicrosecondArray, TimestampNanosecondArray, TimestampSecondArray,
2087    };
2088    use arrow_schema::{IntervalUnit, TimeUnit};
2089    use chrono::Utc;
2090    use rand::prelude::Distribution;
2091
2092    use super::*;
2093
2094    /// Create a generator of vectors by continuously calling the given generator
2095    ///
2096    /// For example, given a step generator and a dimension of 3 this will generate vectors like
2097    /// [0, 1, 2], [3, 4, 5], [6, 7, 8], ...
2098    pub fn cycle_vec(
2099        generator: Box<dyn ArrayGenerator>,
2100        dimension: Dimension,
2101    ) -> Box<dyn ArrayGenerator> {
2102        Box::new(CycleVectorGenerator::new(generator, dimension))
2103    }
2104
2105    /// Create a generator of list vectors by continuously calling the given generator
2106    ///
2107    /// The lists will have lengths uniformly distributed between `min_list_size` (inclusive) and
2108    /// `max_list_size` (exclusive).
2109    pub fn cycle_vec_var(
2110        generator: Box<dyn ArrayGenerator>,
2111        min_list_size: Dimension,
2112        max_list_size: Dimension,
2113    ) -> Box<dyn ArrayGenerator> {
2114        Box::new(CycleListGenerator::new(
2115            generator,
2116            min_list_size,
2117            max_list_size,
2118        ))
2119    }
2120
2121    /// Create a generator of vectors around unit circle
2122    ///
2123    /// Vectors will be equally spaced around the unit circle so that there are num_steps
2124    /// vectors per circle.
2125    pub fn cycle_unit_circle(num_steps: u32) -> Box<dyn ArrayGenerator> {
2126        Box::new(RadialStepGenerator::new(num_steps))
2127    }
2128
2129    /// Create a generator of vectors by cycling through a given set of vectors
2130    ///
2131    /// Each value will be spaced in slightly away from the previous value on a ball of radius jitter
2132    pub fn jitter_centroids(centroids: Arc<dyn Array>, jitter: f32) -> Box<dyn ArrayGenerator> {
2133        Box::new(JitterCentroidsGenerator::try_new(centroids, jitter).unwrap())
2134    }
2135
2136    /// Create a generator from a vector of values
2137    ///
2138    /// If more rows are requested than the length of values then it will restart
2139    /// from the beginning of the vector.
2140    pub fn cycle<DataType>(values: Vec<DataType::Native>) -> Box<dyn ArrayGenerator>
2141    where
2142        DataType::Native: Copy + 'static,
2143        DataType: ArrowPrimitiveType,
2144        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2145    {
2146        let mut values_idx = 0;
2147        Box::new(
2148            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2149                DataType::DATA_TYPE,
2150                move |_| {
2151                    let y = values[values_idx];
2152                    values_idx = (values_idx + 1) % values.len();
2153                    y
2154                },
2155                1,
2156                DataType::DATA_TYPE
2157                    .primitive_width()
2158                    .map(|width| ByteCount::from(width as u64))
2159                    .expect("Primitive types should have a fixed width"),
2160            ),
2161        )
2162    }
2163
2164    /// Create a generator from a vector of booleans
2165    ///
2166    /// If more rows are requested than the length of values then it will restart from
2167    /// the beginning of the vector
2168    pub fn cycle_bool(values: Vec<bool>) -> Box<dyn ArrayGenerator> {
2169        let mut values_idx = 0;
2170        Box::new(FnGen::<bool, BooleanArray, _>::new_unknown_size(
2171            DataType::Boolean,
2172            move |_| {
2173                let val = values[values_idx];
2174                values_idx = (values_idx + 1) % values.len();
2175                val
2176            },
2177            1,
2178        ))
2179    }
2180
2181    /// Create a generator that starts at 0 and increments by 1 for each element
2182    pub fn step<DataType>() -> Box<dyn ArrayGenerator>
2183    where
2184        DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2185        DataType: ArrowPrimitiveType,
2186        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2187    {
2188        let mut x = DataType::Native::default();
2189        Box::new(
2190            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2191                DataType::DATA_TYPE,
2192                move |_| {
2193                    let y = x;
2194                    x += DataType::Native::ONE;
2195                    y
2196                },
2197                1,
2198                DataType::DATA_TYPE
2199                    .primitive_width()
2200                    .map(|width| ByteCount::from(width as u64))
2201                    .expect("Primitive types should have a fixed width"),
2202            ),
2203        )
2204    }
2205
2206    pub fn blob() -> Box<dyn ArrayGenerator> {
2207        let mut blob_meta = HashMap::new();
2208        blob_meta.insert("lance-encoding:blob".to_string(), "true".to_string());
2209        rand_fixedbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
2210    }
2211
2212    /// Create a generator that starts at a given value and increments by a given step for each element
2213    pub fn step_custom<DataType>(
2214        start: DataType::Native,
2215        step: DataType::Native,
2216    ) -> Box<dyn ArrayGenerator>
2217    where
2218        DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2219        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2220        DataType: ArrowPrimitiveType,
2221    {
2222        let mut x = start;
2223        Box::new(
2224            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2225                DataType::DATA_TYPE,
2226                move |_| {
2227                    let y = x;
2228                    x += step;
2229                    y
2230                },
2231                1,
2232                DataType::DATA_TYPE
2233                    .primitive_width()
2234                    .map(|width| ByteCount::from(width as u64))
2235                    .expect("Primitive types should have a fixed width"),
2236            ),
2237        )
2238    }
2239
2240    /// Create a generator that fills each element with the given primitive value
2241    pub fn fill<DataType>(value: DataType::Native) -> Box<dyn ArrayGenerator>
2242    where
2243        DataType::Native: Copy + 'static,
2244        DataType: ArrowPrimitiveType,
2245        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2246    {
2247        Box::new(
2248            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2249                DataType::DATA_TYPE,
2250                move |_| value,
2251                1,
2252                DataType::DATA_TYPE
2253                    .primitive_width()
2254                    .map(|width| ByteCount::from(width as u64))
2255                    .expect("Primitive types should have a fixed width"),
2256            ),
2257        )
2258    }
2259
2260    /// Create a generator that fills each element with the given binary value
2261    pub fn fill_varbin(value: Vec<u8>) -> Box<dyn ArrayGenerator> {
2262        Box::new(FixedBinaryGenerator::<BinaryType>::new(value))
2263    }
2264
2265    /// Create a generator that fills each element with the given string value
2266    pub fn fill_utf8(value: String) -> Box<dyn ArrayGenerator> {
2267        Box::new(FixedBinaryGenerator::<Utf8Type>::new(value.into_bytes()))
2268    }
2269
2270    pub fn cycle_utf8_literals(values: &[&'static str]) -> Box<dyn ArrayGenerator> {
2271        Box::new(CycleBinaryGenerator::<Utf8Type>::from_strings(values))
2272    }
2273
2274    /// Create a generator of primitive values that are randomly sampled from the entire range available for the value
2275    pub fn rand<DataType>() -> Box<dyn ArrayGenerator>
2276    where
2277        DataType::Native: Copy + 'static,
2278        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2279        DataType: ArrowPrimitiveType,
2280        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2281    {
2282        Box::new(
2283            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2284                DataType::DATA_TYPE,
2285                move |rng| rng.random(),
2286                1,
2287                DataType::DATA_TYPE
2288                    .primitive_width()
2289                    .map(|width| ByteCount::from(width as u64))
2290                    .expect("Primitive types should have a fixed width"),
2291            ),
2292        )
2293    }
2294
2295    /// Create a generator of primitive values that are randomly sampled from the entire range available for the value
2296    pub fn rand_with_distribution<
2297        DataType,
2298        Dist: rand::distr::Distribution<DataType::Native> + Clone + Send + Sync + 'static,
2299    >(
2300        dist: Dist,
2301    ) -> Box<dyn ArrayGenerator>
2302    where
2303        DataType::Native: Copy + 'static,
2304        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2305        DataType: ArrowPrimitiveType,
2306    {
2307        Box::new(
2308            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2309                DataType::DATA_TYPE,
2310                move |rng| rng.sample(dist.clone()),
2311                1,
2312                DataType::DATA_TYPE
2313                    .primitive_width()
2314                    .map(|width| ByteCount::from(width as u64))
2315                    .expect("Primitive types should have a fixed width"),
2316            ),
2317        )
2318    }
2319
2320    /// Create a generator of 1d vectors (of a primitive type) consisting of randomly sampled primitive values
2321    pub fn rand_vec<DataType>(dimension: Dimension) -> Box<dyn ArrayGenerator>
2322    where
2323        DataType::Native: Copy + 'static,
2324        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2325        DataType: ArrowPrimitiveType,
2326        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2327    {
2328        let underlying = rand::<DataType>();
2329        cycle_vec(underlying, dimension)
2330    }
2331
2332    /// Create a generator of 1d vectors (of a primitive type) consisting of randomly sampled nullable values
2333    pub fn rand_vec_nullable<DataType>(
2334        dimension: Dimension,
2335        null_probability: f64,
2336    ) -> Box<dyn ArrayGenerator>
2337    where
2338        DataType::Native: Copy + 'static,
2339        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2340        DataType: ArrowPrimitiveType,
2341        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2342    {
2343        let underlying = rand::<DataType>().with_random_nulls(null_probability);
2344        cycle_vec(underlying, dimension)
2345    }
2346
2347    /// Create a generator of randomly sampled time32 values covering the entire
2348    /// range of 1 day
2349    pub fn rand_time32(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2350        let start = 0;
2351        let end = match resolution {
2352            TimeUnit::Second => 86_400,
2353            TimeUnit::Millisecond => 86_400_000,
2354            _ => panic!(),
2355        };
2356
2357        let data_type = DataType::Time32(*resolution);
2358        let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2359        let dist = Uniform::new(start, end).unwrap();
2360        let sample_fn = move |rng: &mut _| dist.sample(rng);
2361
2362        match resolution {
2363            TimeUnit::Second => Box::new(FnGen::<i32, Time32SecondArray, _>::new_known_size(
2364                data_type, sample_fn, 1, size,
2365            )),
2366            TimeUnit::Millisecond => {
2367                Box::new(FnGen::<i32, Time32MillisecondArray, _>::new_known_size(
2368                    data_type, sample_fn, 1, size,
2369                ))
2370            }
2371            _ => panic!(),
2372        }
2373    }
2374
2375    /// Create a generator of randomly sampled time64 values covering the entire
2376    /// range of 1 day
2377    pub fn rand_time64(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2378        let start = 0_i64;
2379        let end: i64 = match resolution {
2380            TimeUnit::Microsecond => 86_400_000,
2381            TimeUnit::Nanosecond => 86_400_000_000,
2382            _ => panic!(),
2383        };
2384
2385        let data_type = DataType::Time64(*resolution);
2386        let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2387        let dist = Uniform::new(start, end).unwrap();
2388        let sample_fn = move |rng: &mut _| dist.sample(rng);
2389
2390        match resolution {
2391            TimeUnit::Microsecond => {
2392                Box::new(FnGen::<i64, Time64MicrosecondArray, _>::new_known_size(
2393                    data_type, sample_fn, 1, size,
2394                ))
2395            }
2396            TimeUnit::Nanosecond => {
2397                Box::new(FnGen::<i64, Time64NanosecondArray, _>::new_known_size(
2398                    data_type, sample_fn, 1, size,
2399                ))
2400            }
2401            _ => panic!(),
2402        }
2403    }
2404
    /// Create a generator of random UUIDs, stored as fixed size binary values
    ///
    /// Note, these are "pseudo UUIDs".  They are 16-byte randomish values but they
    /// are not guaranteed to be unique.  We use a simplistic RNG that trades uniqueness
    /// for speed.
    pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
        // The generator has no configuration, so it is built through its `Default` impl
        Box::<PseudoUuidGenerator>::default()
    }
2413
    /// Create a generator of random UUIDs, stored as 32-character strings (hex encoding
    /// of the 16-byte binary value)
    ///
    /// Note, these are "pseudo UUIDs".  They are 16-byte randomish values but they
    /// are not guaranteed to be unique.  We use a simplistic RNG that trades uniqueness
    /// for speed.
    pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
        // The generator has no configuration, so it is built through its `Default` impl
        Box::<PseudoUuidHexGenerator>::default()
    }
2423
2424    pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(
2425        data_type: DataType,
2426    ) -> Box<dyn ArrayGenerator> {
2427        Box::new(RandomBytesGenerator::<T>::new(data_type))
2428    }
2429
2430    pub fn rand_fsb(size: i32) -> Box<dyn ArrayGenerator> {
2431        Box::new(RandomFixedSizeBinaryGenerator::new(size))
2432    }
2433
2434    pub fn rand_interval(unit: IntervalUnit) -> Box<dyn ArrayGenerator> {
2435        Box::new(RandomIntervalGenerator::new(unit))
2436    }
2437
2438    /// Create a generator of randomly sampled date32 values
2439    ///
2440    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2441    /// is a more common use pattern
2442    pub fn rand_date32() -> Box<dyn ArrayGenerator> {
2443        let now = chrono::Utc::now();
2444        let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try days");
2445        rand_date32_in_range(one_year_ago, now)
2446    }
2447
2448    /// Create a generator of randomly sampled date32 values in the given range
2449    pub fn rand_date32_in_range(
2450        start: chrono::DateTime<Utc>,
2451        end: chrono::DateTime<Utc>,
2452    ) -> Box<dyn ArrayGenerator> {
2453        let data_type = DataType::Date32;
2454        let end_ms = end.timestamp_millis();
2455        let end_days = (end_ms / MS_PER_DAY) as i32;
2456        let start_ms = start.timestamp_millis();
2457        let start_days = (start_ms / MS_PER_DAY) as i32;
2458        let dist = Uniform::new(start_days, end_days).unwrap();
2459
2460        Box::new(FnGen::<i32, Date32Array, _>::new_known_size(
2461            data_type,
2462            move |rng| dist.sample(rng),
2463            1,
2464            DataType::Date32
2465                .primitive_width()
2466                .map(|width| ByteCount::from(width as u64))
2467                .expect("Date32 should have a fixed width"),
2468        ))
2469    }
2470
2471    /// Create a generator of randomly sampled date64 values
2472    ///
2473    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2474    /// is a more common use pattern
2475    pub fn rand_date64() -> Box<dyn ArrayGenerator> {
2476        let now = chrono::Utc::now();
2477        let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try_days");
2478        rand_date64_in_range(one_year_ago, now)
2479    }
2480
2481    /// Create a generator of randomly sampled timestamp values in the given range
2482    ///
2483    /// Currently just samples the entire range of u64 values and casts to timestamp
2484    pub fn rand_timestamp_in_range(
2485        start: chrono::DateTime<Utc>,
2486        end: chrono::DateTime<Utc>,
2487        data_type: &DataType,
2488    ) -> Box<dyn ArrayGenerator> {
2489        let end_ms = end.timestamp_millis();
2490        let start_ms = start.timestamp_millis();
2491        let (start_ticks, end_ticks) = match data_type {
2492            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2493                (start_ms * 1000 * 1000, end_ms * 1000 * 1000)
2494            }
2495            DataType::Timestamp(TimeUnit::Microsecond, _) => (start_ms * 1000, end_ms * 1000),
2496            DataType::Timestamp(TimeUnit::Millisecond, _) => (start_ms, end_ms),
2497            DataType::Timestamp(TimeUnit::Second, _) => (start.timestamp(), end.timestamp()),
2498            _ => panic!(),
2499        };
2500        let dist = Uniform::new(start_ticks, end_ticks).unwrap();
2501
2502        let data_type = data_type.clone();
2503        let sample_fn = move |rng: &mut _| dist.sample(rng);
2504        let width = data_type
2505            .primitive_width()
2506            .map(|width| ByteCount::from(width as u64))
2507            .unwrap();
2508
2509        match data_type {
2510            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2511                Box::new(FnGen::<i64, TimestampNanosecondArray, _>::new_known_size(
2512                    data_type, sample_fn, 1, width,
2513                ))
2514            }
2515            DataType::Timestamp(TimeUnit::Microsecond, _) => {
2516                Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2517                    data_type, sample_fn, 1, width,
2518                ))
2519            }
2520            DataType::Timestamp(TimeUnit::Millisecond, _) => {
2521                Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2522                    data_type, sample_fn, 1, width,
2523                ))
2524            }
2525            DataType::Timestamp(TimeUnit::Second, _) => {
2526                Box::new(FnGen::<i64, TimestampSecondArray, _>::new_known_size(
2527                    data_type, sample_fn, 1, width,
2528                ))
2529            }
2530            _ => panic!(),
2531        }
2532    }
2533
2534    pub fn rand_timestamp(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2535        let now = chrono::Utc::now();
2536        let one_year_ago = now - chrono::Duration::try_days(365).unwrap();
2537        rand_timestamp_in_range(one_year_ago, now, data_type)
2538    }
2539
2540    /// Create a generator of randomly sampled date64 values
2541    ///
2542    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2543    /// is a more common use pattern
2544    pub fn rand_date64_in_range(
2545        start: chrono::DateTime<Utc>,
2546        end: chrono::DateTime<Utc>,
2547    ) -> Box<dyn ArrayGenerator> {
2548        let data_type = DataType::Date64;
2549        let end_ms = end.timestamp_millis();
2550        let end_days = end_ms / MS_PER_DAY;
2551        let start_ms = start.timestamp_millis();
2552        let start_days = start_ms / MS_PER_DAY;
2553        let dist = Uniform::new(start_days, end_days).unwrap();
2554
2555        Box::new(FnGen::<i64, Date64Array, _>::new_known_size(
2556            data_type,
2557            move |rng| (dist.sample(rng)) * MS_PER_DAY,
2558            1,
2559            DataType::Date64
2560                .primitive_width()
2561                .map(|width| ByteCount::from(width as u64))
2562                .expect("Date64 should have a fixed width"),
2563        ))
2564    }
2565
2566    /// Create a generator of random binary values where each value has a fixed number of bytes
2567    pub fn rand_fixedbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2568        Box::new(RandomBinaryGenerator::new(
2569            bytes_per_element,
2570            false,
2571            is_large,
2572        ))
2573    }
2574
2575    /// Create a generator of random binary values where each value has a variable number of bytes
2576    ///
2577    /// The number of bytes per element will be randomly sampled from the given (inclusive) range
2578    pub fn rand_varbin(
2579        min_bytes_per_element: ByteCount,
2580        max_bytes_per_element: ByteCount,
2581    ) -> Box<dyn ArrayGenerator> {
2582        Box::new(VariableRandomBinaryGenerator::new(
2583            min_bytes_per_element,
2584            max_bytes_per_element,
2585        ))
2586    }
2587
2588    /// Create a generator of random strings
2589    ///
2590    /// All strings will consist entirely of printable ASCII characters
2591    pub fn rand_utf8(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2592        Box::new(RandomBinaryGenerator::new(
2593            bytes_per_element,
2594            true,
2595            is_large,
2596        ))
2597    }
2598
2599    /// Creates a generator of strings with a prefix and a counter
2600    ///
2601    /// For example, if the prefix is "user_" the the strings will be "user_0", "user_1", ...
2602    pub fn utf8_prefix_plus_counter(
2603        prefix: impl Into<String>,
2604        is_large: bool,
2605    ) -> Box<dyn ArrayGenerator> {
2606        Box::new(PrefixPlusCounterGenerator::new(prefix.into(), is_large))
2607    }
2608
2609    pub fn binary_prefix_plus_counter(
2610        prefix: Arc<[u8]>,
2611        is_large: bool,
2612    ) -> Box<dyn ArrayGenerator> {
2613        Box::new(BinaryPrefixPlusCounterGenerator::new(prefix, is_large))
2614    }
2615
    /// Create a random generator of boolean values
    pub fn rand_boolean() -> Box<dyn ArrayGenerator> {
        // The generator has no configuration, so it is built through its `Default` impl
        Box::<RandomBooleanGenerator>::default()
    }
2620
2621    /// Create a generator of random sentences
2622    ///
2623    /// Generates strings containing between min_words and max_words random English words joined by spaces
2624    pub fn random_sentence(
2625        min_words: usize,
2626        max_words: usize,
2627        is_large: bool,
2628    ) -> Box<dyn ArrayGenerator> {
2629        Box::new(RandomSentenceGenerator::new(min_words, max_words, is_large))
2630    }
2631
2632    /// Create a generator of random words (one word per row)
2633    ///
2634    /// Generates strings containing a single random English word per row
2635    pub fn random_word(is_large: bool) -> Box<dyn ArrayGenerator> {
2636        Box::new(RandomWordGenerator::new(is_large))
2637    }
2638
2639    pub fn rand_list(item_type: &DataType, is_large: bool) -> Box<dyn ArrayGenerator> {
2640        let child_gen = rand_type(item_type);
2641        Box::new(RandomListGenerator::new(child_gen, is_large))
2642    }
2643
2644    pub fn rand_list_any(
2645        item_gen: Box<dyn ArrayGenerator>,
2646        is_large: bool,
2647    ) -> Box<dyn ArrayGenerator> {
2648        Box::new(RandomListGenerator::new(item_gen, is_large))
2649    }
2650
2651    pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
2652        let child_gens = fields
2653            .iter()
2654            .map(|f| rand_type(f.data_type()))
2655            .collect::<Vec<_>>();
2656        Box::new(RandomStructGenerator::new(fields, child_gens))
2657    }
2658
2659    pub fn null_type() -> Box<dyn ArrayGenerator> {
2660        Box::new(NullArrayGenerator {})
2661    }
2662
2663    /// Create a generator of random values
2664    pub fn rand_type(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2665        match data_type {
2666            DataType::Boolean => rand_boolean(),
2667            DataType::Int8 => rand::<Int8Type>(),
2668            DataType::Int16 => rand::<Int16Type>(),
2669            DataType::Int32 => rand::<Int32Type>(),
2670            DataType::Int64 => rand::<Int64Type>(),
2671            DataType::UInt8 => rand::<UInt8Type>(),
2672            DataType::UInt16 => rand::<UInt16Type>(),
2673            DataType::UInt32 => rand::<UInt32Type>(),
2674            DataType::UInt64 => rand::<UInt64Type>(),
2675            DataType::Float16 => rand_primitive::<Float16Type>(data_type.clone()),
2676            DataType::Float32 => rand::<Float32Type>(),
2677            DataType::Float64 => rand::<Float64Type>(),
2678            DataType::Decimal128(_, _) => rand_primitive::<Decimal128Type>(data_type.clone()),
2679            DataType::Decimal256(_, _) => rand_primitive::<Decimal256Type>(data_type.clone()),
2680            DataType::Utf8 => rand_utf8(ByteCount::from(12), false),
2681            DataType::LargeUtf8 => rand_utf8(ByteCount::from(12), true),
2682            DataType::Binary => rand_fixedbin(ByteCount::from(12), false),
2683            DataType::LargeBinary => rand_fixedbin(ByteCount::from(12), true),
2684            DataType::Dictionary(key_type, value_type) => {
2685                dict_type(rand_type(value_type), key_type)
2686            }
2687            DataType::FixedSizeList(child, dimension) => cycle_vec(
2688                rand_type(child.data_type()),
2689                Dimension::from(*dimension as u32),
2690            ),
2691            DataType::FixedSizeBinary(size) => rand_fsb(*size),
2692            DataType::List(child) => rand_list(child.data_type(), false),
2693            DataType::LargeList(child) => rand_list(child.data_type(), true),
2694            DataType::Duration(unit) => match unit {
2695                TimeUnit::Second => rand::<DurationSecondType>(),
2696                TimeUnit::Millisecond => rand::<DurationMillisecondType>(),
2697                TimeUnit::Microsecond => rand::<DurationMicrosecondType>(),
2698                TimeUnit::Nanosecond => rand::<DurationNanosecondType>(),
2699            },
2700            DataType::Interval(unit) => rand_interval(*unit),
2701            DataType::Date32 => rand_date32(),
2702            DataType::Date64 => rand_date64(),
2703            DataType::Time32(resolution) => rand_time32(resolution),
2704            DataType::Time64(resolution) => rand_time64(resolution),
2705            DataType::Timestamp(_, _) => rand_timestamp(data_type),
2706            DataType::Struct(fields) => rand_struct(fields.clone()),
2707            DataType::Null => null_type(),
2708            _ => unimplemented!("random generation of {}", data_type),
2709        }
2710    }
2711
2712    /// Encodes arrays generated by the underlying generator as dictionaries with the given key type
2713    ///
2714    /// Note that this may not be very realistic if the underlying generator is something like a random
2715    /// generator since most of the underlying values will be unique and the common case for dictionary
2716    /// encoding is when there is a small set of possible values.
2717    pub fn dict<K: ArrowDictionaryKeyType + Send + Sync>(
2718        generator: Box<dyn ArrayGenerator>,
2719    ) -> Box<dyn ArrayGenerator> {
2720        Box::new(DictionaryGenerator::<K>::new(generator))
2721    }
2722
2723    /// Encodes arrays generated by the underlying generator as dictionaries with the given key type
2724    pub fn dict_type(
2725        generator: Box<dyn ArrayGenerator>,
2726        key_type: &DataType,
2727    ) -> Box<dyn ArrayGenerator> {
2728        match key_type {
2729            DataType::Int8 => dict::<Int8Type>(generator),
2730            DataType::Int16 => dict::<Int16Type>(generator),
2731            DataType::Int32 => dict::<Int32Type>(generator),
2732            DataType::Int64 => dict::<Int64Type>(generator),
2733            DataType::UInt8 => dict::<UInt8Type>(generator),
2734            DataType::UInt16 => dict::<UInt16Type>(generator),
2735            DataType::UInt32 => dict::<UInt32Type>(generator),
2736            DataType::UInt64 => dict::<UInt64Type>(generator),
2737            _ => unimplemented!(),
2738        }
2739    }
2740}
2741
2742/// Create a BatchGeneratorBuilder to start generating batch data
2743pub fn gen_batch() -> BatchGeneratorBuilder {
2744    BatchGeneratorBuilder::default()
2745}
2746
2747/// Create an ArrayGeneratorBuilder to start generating array data
2748pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
2749    ArrayGeneratorBuilder::new(genn)
2750}
2751
2752/// Create a BatchGeneratorBuilder with the given schema
2753///
2754/// You can add more columns or convert this into a reader immediately
2755pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
2756    let mut builder = BatchGeneratorBuilder::default();
2757    for field in schema.fields() {
2758        builder = builder.col(field.name(), array::rand_type(field.data_type()));
2759    }
2760    builder
2761}
2762
2763#[cfg(test)]
2764mod tests {
2765
2766    use arrow::datatypes::{Float32Type, Int16Type, Int8Type, UInt32Type};
2767    use arrow_array::{BooleanArray, Float32Array, Int16Array, Int32Array, Int8Array, UInt32Array};
2768
2769    use super::*;
2770
2771    #[test]
2772    fn test_step() {
2773        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2774        let mut genn = array::step::<Int32Type>();
2775        assert_eq!(
2776            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
2777            Int32Array::from_iter([0, 1, 2, 3, 4])
2778        );
2779        assert_eq!(
2780            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
2781            Int32Array::from_iter([5, 6, 7, 8, 9])
2782        );
2783
2784        let mut genn = array::step::<Int8Type>();
2785        assert_eq!(
2786            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2787            Int8Array::from_iter([0, 1, 2])
2788        );
2789
2790        let mut genn = array::step::<Float32Type>();
2791        assert_eq!(
2792            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2793            Float32Array::from_iter([0.0, 1.0, 2.0])
2794        );
2795
2796        let mut genn = array::step_custom::<Int16Type>(4, 8);
2797        assert_eq!(
2798            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2799            Int16Array::from_iter([4, 12, 20])
2800        );
2801        assert_eq!(
2802            *genn.generate(RowCount::from(2), &mut rng).unwrap(),
2803            Int16Array::from_iter([28, 36])
2804        );
2805    }
2806
2807    #[test]
2808    fn test_cycle() {
2809        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2810        let mut genn = array::cycle::<Int32Type>(vec![1, 2, 3]);
2811        assert_eq!(
2812            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
2813            Int32Array::from_iter([1, 2, 3, 1, 2])
2814        );
2815
2816        let mut genn = array::cycle_utf8_literals(&["abc", "def", "xyz"]);
2817        assert_eq!(
2818            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
2819            StringArray::from_iter_values(["abc", "def", "xyz", "abc", "def"])
2820        );
2821        assert_eq!(
2822            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
2823            StringArray::from_iter_values(["xyz"])
2824        );
2825
2826        let mut genn = array::cycle_bool(vec![false, false, true]);
2827        assert_eq!(
2828            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
2829            BooleanArray::from_iter(vec![false, false, true, false, false].into_iter().map(Some))
2830        );
2831        assert_eq!(
2832            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
2833            BooleanArray::from_iter(vec![Some(true)])
2834        )
2835    }
2836
2837    #[test]
2838    fn test_fill() {
2839        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2840        let mut genn = array::fill::<Int32Type>(42);
2841        assert_eq!(
2842            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2843            Int32Array::from_iter([42, 42, 42])
2844        );
2845        assert_eq!(
2846            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2847            Int32Array::from_iter([42, 42, 42])
2848        );
2849
2850        let mut genn = array::fill_varbin(vec![0, 1, 2]);
2851        assert_eq!(
2852            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2853            arrow_array::BinaryArray::from_iter_values([
2854                "\x00\x01\x02",
2855                "\x00\x01\x02",
2856                "\x00\x01\x02"
2857            ])
2858        );
2859
2860        let mut genn = array::fill_utf8("xyz".to_string());
2861        assert_eq!(
2862            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2863            arrow_array::StringArray::from_iter_values(["xyz", "xyz", "xyz"])
2864        );
2865    }
2866
2867    #[test]
2868    fn test_utf8_prefix_plus_counter() {
2869        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2870        let mut genn = array::utf8_prefix_plus_counter("user_", false);
2871        assert_eq!(
2872            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2873            arrow_array::StringArray::from_iter_values(["user_0", "user_1", "user_2"])
2874        );
2875    }
2876
2877    #[test]
2878    fn test_rng() {
2879        // Note: these tests are heavily dependent on the default seed.
2880        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2881        let mut genn = array::rand::<Int32Type>();
2882        assert_eq!(
2883            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2884            Int32Array::from_iter([-797553329, 1369325940, -69174021])
2885        );
2886
2887        let mut genn = array::rand_fixedbin(ByteCount::from(3), false);
2888        assert_eq!(
2889            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2890            arrow_array::BinaryArray::from_iter_values([
2891                [184, 53, 216],
2892                [12, 96, 159],
2893                [125, 179, 56]
2894            ])
2895        );
2896
2897        let mut genn = array::rand_utf8(ByteCount::from(3), false);
2898        assert_eq!(
2899            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2900            arrow_array::StringArray::from_iter_values([">@p", "n `", "NWa"])
2901        );
2902
2903        let mut genn = array::random_sentence(1, 5, false);
2904        let words = genn.generate(RowCount::from(10), &mut rng).unwrap();
2905        assert_eq!(words.data_type(), &DataType::Utf8);
2906        let words_array = words.as_any().downcast_ref::<StringArray>().unwrap();
2907        // Verify each string contains 1-5 words
2908        for i in 0..10 {
2909            let sentence = words_array.value(i);
2910            let word_count = sentence.split_whitespace().count();
2911            assert!((1..=5).contains(&word_count));
2912        }
2913
2914        let mut genn = array::rand_date32();
2915        let days_32 = genn.generate(RowCount::from(3), &mut rng).unwrap();
2916        assert_eq!(days_32.data_type(), &DataType::Date32);
2917
2918        let mut genn = array::rand_date64();
2919        let days_64 = genn.generate(RowCount::from(3), &mut rng).unwrap();
2920        assert_eq!(days_64.data_type(), &DataType::Date64);
2921
2922        let mut genn = array::rand_boolean();
2923        let bools = genn.generate(RowCount::from(1024), &mut rng).unwrap();
2924        assert_eq!(bools.data_type(), &DataType::Boolean);
2925        let bools = bools.as_any().downcast_ref::<BooleanArray>().unwrap();
2926        // Sanity check to ensure we're getting at least some rng
2927        assert!(bools.false_count() > 100);
2928        assert!(bools.true_count() > 100);
2929
2930        let mut genn = array::rand_varbin(ByteCount::from(2), ByteCount::from(4));
2931        assert_eq!(
2932            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
2933            arrow_array::BinaryArray::from_iter_values([
2934                vec![234, 107],
2935                vec![220, 152],
2936                vec![21, 16, 184, 220]
2937            ])
2938        );
2939    }
2940
2941    #[test]
2942    fn test_rng_list() {
2943        // Note: these tests are heavily dependent on the default seed.
2944        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2945        let mut genn = array::rand_list(&DataType::Int32, false);
2946        let arr = genn.generate(RowCount::from(100), &mut rng).unwrap();
2947        // Make sure we can generate empty lists (note, test is dependent on seed)
2948        let arr = arr.as_list::<i32>();
2949        assert!(arr.iter().any(|l| l.unwrap().is_empty()));
2950        // Shouldn't generate any giant lists (don't kill performance in normal datagen)
2951        assert!(arr.iter().any(|l| l.unwrap().len() < 11));
2952    }
2953
2954    #[test]
2955    fn test_rng_distribution() {
2956        // Sanity test to make sure we our RNG is giving us well distributed values
2957        // We generates some 4-byte integers, histogram them into 8 buckets, and make
2958        // sure each bucket has a good # of values
2959        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2960        let mut genn = array::rand::<UInt32Type>();
2961        for _ in 0..10 {
2962            let arr = genn.generate(RowCount::from(10000), &mut rng).unwrap();
2963            let int_arr = arr.as_any().downcast_ref::<UInt32Array>().unwrap();
2964            let mut buckets = vec![0_u32; 256];
2965            for val in int_arr.values() {
2966                buckets[(*val >> 24) as usize] += 1;
2967            }
2968            for bucket in buckets {
2969                // Perfectly even distribution would have 10000 / 256 values (~40) per bucket
2970                // We test for 15 which should be "good enough" and statistically unlikely to fail
2971                assert!(bucket > 15);
2972            }
2973        }
2974    }
2975
2976    #[test]
2977    fn test_nulls() {
2978        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
2979        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.3);
2980
2981        let arr = genn.generate(RowCount::from(1000), &mut rng).unwrap();
2982
2983        // This assert depends on the default seed
2984        assert_eq!(arr.null_count(), 297);
2985
2986        for len in 0..100 {
2987            let arr = genn.generate(RowCount::from(len), &mut rng).unwrap();
2988            // Make sure the null count we came up with matches the actual # of unset bits
2989            assert_eq!(
2990                arr.null_count(),
2991                arr.nulls()
2992                    .map(|nulls| (len as usize)
2993                        - nulls.buffer().count_set_bits_offset(0, len as usize))
2994                    .unwrap_or(0)
2995            );
2996        }
2997
2998        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.0);
2999        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();
3000
3001        assert_eq!(arr.null_count(), 0);
3002
3003        let mut genn = array::rand::<Int32Type>().with_random_nulls(1.0);
3004        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();
3005
3006        assert_eq!(arr.null_count(), 10);
3007        assert!((0..10).all(|idx| arr.is_null(idx)));
3008
3009        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, true]);
3010        let arr = genn.generate(RowCount::from(7), &mut rng).unwrap();
3011        assert!((0..2).all(|idx| arr.is_valid(idx)));
3012        assert!(arr.is_null(2));
3013        assert!((3..5).all(|idx| arr.is_valid(idx)));
3014        assert!(arr.is_null(5));
3015        assert!(arr.is_valid(6));
3016    }
3017
3018    #[test]
3019    fn test_unit_circle() {
3020        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3021        let mut genn = array::cycle_unit_circle(4);
3022        let arr = genn.generate(RowCount::from(6), &mut rng).unwrap();
3023
3024        let arr_values = arr
3025            .as_fixed_size_list()
3026            .values()
3027            .as_primitive::<Float32Type>()
3028            .values()
3029            .to_vec();
3030        assert_eq!(arr_values.len(), 12);
3031        let expected_values = [1.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 1.0];
3032        for (actual, expected) in arr_values.iter().zip(expected_values.iter()) {
3033            assert!((actual - expected).abs() < 0.0001);
3034        }
3035    }
3036
3037    #[test]
3038    fn test_jitter_centroids() {
3039        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3040        let mut centroids_gen = array::cycle_unit_circle(4);
3041        let centroids = centroids_gen.generate(RowCount::from(4), &mut rng).unwrap();
3042
3043        let centroid_values = centroids
3044            .as_fixed_size_list()
3045            .values()
3046            .as_primitive::<Float32Type>()
3047            .values()
3048            .to_vec();
3049
3050        let mut jitter_jen = array::jitter_centroids(centroids, 0.001);
3051        let jittered = jitter_jen.generate(RowCount::from(100), &mut rng).unwrap();
3052
3053        let values = jittered
3054            .as_fixed_size_list()
3055            .values()
3056            .as_primitive::<Float32Type>()
3057            .values()
3058            .to_vec();
3059
3060        for i in 0..100 {
3061            let centroid = i % 4;
3062            let centroid_x = centroid_values[centroid * 2];
3063            let centroid_y = centroid_values[centroid * 2 + 1];
3064            let value_x = values[i * 2];
3065            let value_y = values[i * 2 + 1];
3066
3067            let l2_dist = ((value_x - centroid_x).powi(2) + (value_y - centroid_y).powi(2)).sqrt();
3068            assert!(l2_dist < 0.001001);
3069            assert!(l2_dist > 0.000999);
3070        }
3071    }
3072
3073    #[test]
3074    fn test_rand_schema() {
3075        let schema = Schema::new(vec![
3076            Field::new("a", DataType::Int32, true),
3077            Field::new("b", DataType::Utf8, true),
3078            Field::new("c", DataType::Float32, true),
3079            Field::new("d", DataType::Int32, true),
3080            Field::new("e", DataType::Int32, true),
3081        ]);
3082        let rbr = rand(&schema)
3083            .into_reader_bytes(
3084                ByteCount::from(1024 * 1024),
3085                BatchCount::from(8),
3086                RoundingBehavior::ExactOrErr,
3087            )
3088            .unwrap();
3089        assert_eq!(*rbr.schema(), schema);
3090
3091        let batches = rbr.map(|val| val.unwrap()).collect::<Vec<_>>();
3092        assert_eq!(batches.len(), 8);
3093
3094        for batch in batches {
3095            assert_eq!(batch.num_rows(), 1024 * 1024 / 32);
3096            assert_eq!(batch.num_columns(), 5);
3097        }
3098    }
3099}