Skip to main content

lance_datagen/
generator.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc, sync::LazyLock};
5
6use arrow::{
7    array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder},
8    buffer::{BooleanBuffer, Buffer, OffsetBuffer, ScalarBuffer},
9    datatypes::{
10        ArrowPrimitiveType, Float32Type, Int32Type, Int64Type, IntervalDayTime,
11        IntervalMonthDayNano, UInt32Type,
12    },
13};
14use arrow_array::{
15    Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray,
16    LargeStringArray, ListArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
17    RecordBatchOptions, RecordBatchReader, StringArray, StructArray, make_array,
18    types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
19};
20use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
21use futures::{StreamExt, stream::BoxStream};
22use rand::{Rng, RngCore, SeedableRng, distr::Uniform};
23use rand_distr::Zipf;
24
25use self::array::rand_with_distribution;
26
/// A number of rows (array elements), stored as a `u64`.
///
/// Used as the `length` argument of [`ArrayGenerator::generate`].
#[derive(Copy, Clone, Debug, Default)]
pub struct RowCount(u64);
/// A number of batches, stored as a `u32`.
// NOTE(review): meaning taken from the type name; its users are outside this excerpt.
#[derive(Copy, Clone, Debug, Default)]
pub struct BatchCount(u32);
/// A size in bytes, stored as a `u64`.
///
/// Returned by [`ArrayGenerator::element_size_bytes`] to describe per-element size.
#[derive(Copy, Clone, Debug, Default)]
pub struct ByteCount(u64);
/// A per-row element count, stored as a `u32`.
///
/// Used as the width of fixed-size lists and as the bounds for variable list sizes.
#[derive(Copy, Clone, Debug, Default)]
pub struct Dimension(u32);
35
36impl From<u32> for BatchCount {
37    fn from(n: u32) -> Self {
38        Self(n)
39    }
40}
41
42impl From<u64> for RowCount {
43    fn from(n: u64) -> Self {
44        Self(n)
45    }
46}
47
48impl From<u64> for ByteCount {
49    fn from(n: u64) -> Self {
50        Self(n)
51    }
52}
53
54impl From<u32> for Dimension {
55    fn from(n: u32) -> Self {
56        Self(n)
57    }
58}
59
60/// A trait for anything that can generate arrays of data
61pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
62    /// Generate an array of the given length
63    ///
64    /// # Arguments
65    ///
66    /// * `length` - The number of elements to generate
67    /// * `rng` - The random number generator to use
68    ///
69    /// # Returns
70    ///
71    /// An array of the given length
72    ///
73    /// Note: Not every generator needs an rng.  However, it is passed here because many do and this
74    /// lets us manage RNGs at the batch level instead of the array level.
75    fn generate(
76        &mut self,
77        length: RowCount,
78        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
79    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError>;
80
81    /// Generate an array of the given length using a new RNG with the default seed
82    ///
83    /// # Arguments
84    ///
85    /// * `length` - The number of elements to generate
86    ///
87    /// # Returns
88    ///
89    /// An array of the given length
90    fn generate_default(
91        &mut self,
92        length: RowCount,
93    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
94        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
95        Self::generate(self, length, &mut rng)
96    }
97    /// Get the data type of the array that this generator produces
98    ///
99    /// # Returns
100    ///
101    /// The data type of the array that this generator produces
102    fn data_type(&self) -> &DataType;
103    /// Gets metadata that should be associated with the field generated by this generator
104    fn metadata(&self) -> Option<HashMap<String, String>> {
105        None
106    }
107    /// Get the size of each element in bytes
108    ///
109    /// # Returns
110    ///
111    /// The size of each element in bytes.  Will be None if the size varies by element.
112    fn element_size_bytes(&self) -> Option<ByteCount>;
113}
114
/// Wraps another generator and replaces the validity bitmap of everything it produces
/// with a repeating pattern.
#[derive(Debug)]
pub struct CycleNullGenerator {
    // Generator whose output gets its validity replaced
    generator: Box<dyn ArrayGenerator>,
    // Repeating validity pattern; `true` marks a valid (non-null) slot
    validity: Vec<bool>,
    // Position in `validity` at which the next generated array starts
    idx: usize,
}
/// Wraps another generator and overwrites floating point values with NaN according to a
/// repeating pattern.
#[derive(Debug)]
pub struct CycleNanGenerator {
    // Generator whose output gets NaNs injected
    generator: Box<dyn ArrayGenerator>,
    // Repeating pattern; `true` marks a slot that becomes NaN
    nan_pattern: Vec<bool>,
    // Position in `nan_pattern` at which the next generated array starts
    idx: usize,
}
127
128impl ArrayGenerator for CycleNanGenerator {
129    fn generate(
130        &mut self,
131        length: RowCount,
132        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
133    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
134        let array = self.generator.generate(length, rng)?;
135
136        // Only apply NaN pattern to float types
137        match array.data_type() {
138            DataType::Float16 => {
139                let float_array = array
140                    .as_any()
141                    .downcast_ref::<arrow_array::Float16Array>()
142                    .unwrap();
143                let mut values: Vec<half::f16> = float_array.values().to_vec();
144
145                for (i, &should_be_nan) in self
146                    .nan_pattern
147                    .iter()
148                    .cycle()
149                    .skip(self.idx)
150                    .take(length.0 as usize)
151                    .enumerate()
152                {
153                    if should_be_nan {
154                        values[i] = half::f16::NAN;
155                    }
156                }
157
158                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
159                Ok(Arc::new(arrow_array::Float16Array::from(values)))
160            }
161            DataType::Float32 => {
162                let float_array = array
163                    .as_any()
164                    .downcast_ref::<arrow_array::Float32Array>()
165                    .unwrap();
166                let mut values: Vec<f32> = float_array.values().to_vec();
167
168                for (i, &should_be_nan) in self
169                    .nan_pattern
170                    .iter()
171                    .cycle()
172                    .skip(self.idx)
173                    .take(length.0 as usize)
174                    .enumerate()
175                {
176                    if should_be_nan {
177                        values[i] = f32::NAN;
178                    }
179                }
180
181                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
182                Ok(Arc::new(arrow_array::Float32Array::from(values)))
183            }
184            DataType::Float64 => {
185                let float_array = array
186                    .as_any()
187                    .downcast_ref::<arrow_array::Float64Array>()
188                    .unwrap();
189                let mut values: Vec<f64> = float_array.values().to_vec();
190
191                for (i, &should_be_nan) in self
192                    .nan_pattern
193                    .iter()
194                    .cycle()
195                    .skip(self.idx)
196                    .take(length.0 as usize)
197                    .enumerate()
198                {
199                    if should_be_nan {
200                        values[i] = f64::NAN;
201                    }
202                }
203
204                self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
205                Ok(Arc::new(arrow_array::Float64Array::from(values)))
206            }
207            _ => {
208                // For non-float types, just return the original array unchanged
209                Ok(array)
210            }
211        }
212    }
213
214    fn data_type(&self) -> &DataType {
215        self.generator.data_type()
216    }
217
218    fn element_size_bytes(&self) -> Option<ByteCount> {
219        self.generator.element_size_bytes()
220    }
221}
222
223impl ArrayGenerator for CycleNullGenerator {
224    fn generate(
225        &mut self,
226        length: RowCount,
227        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
228    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
229        let array = self.generator.generate(length, rng)?;
230        let data = array.to_data();
231        let validity_itr = self
232            .validity
233            .iter()
234            .cycle()
235            .skip(self.idx)
236            .take(length.0 as usize)
237            .copied();
238        let validity_bitmap = BooleanBuffer::from_iter(validity_itr);
239
240        self.idx = (self.idx + (length.0 as usize)) % self.validity.len();
241        unsafe {
242            let new_data = ArrayData::new_unchecked(
243                data.data_type().clone(),
244                data.len(),
245                None,
246                Some(validity_bitmap.into_inner()),
247                data.offset(),
248                data.buffers().to_vec(),
249                data.child_data().into(),
250            );
251            Ok(make_array(new_data))
252        }
253    }
254
255    fn data_type(&self) -> &DataType {
256        self.generator.data_type()
257    }
258
259    fn element_size_bytes(&self) -> Option<ByteCount> {
260        self.generator.element_size_bytes()
261    }
262}
263
/// Wraps another generator and reports a fixed set of field metadata for it.
#[derive(Debug)]
pub struct MetadataGenerator {
    // Generator that actually produces the arrays
    generator: Box<dyn ArrayGenerator>,
    // Metadata returned (as a clone) from `ArrayGenerator::metadata`
    metadata: HashMap<String, String>,
}
269
270impl ArrayGenerator for MetadataGenerator {
271    fn generate(
272        &mut self,
273        length: RowCount,
274        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
275    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
276        self.generator.generate(length, rng)
277    }
278
279    fn metadata(&self) -> Option<HashMap<String, String>> {
280        Some(self.metadata.clone())
281    }
282
283    fn data_type(&self) -> &DataType {
284        self.generator.data_type()
285    }
286
287    fn element_size_bytes(&self) -> Option<ByteCount> {
288        self.generator.element_size_bytes()
289    }
290}
291
/// Wraps another generator and replaces the validity bitmap of everything it produces
/// with randomly placed nulls.
#[derive(Debug)]
pub struct NullGenerator {
    // Generator whose output gets its validity replaced
    generator: Box<dyn ArrayGenerator>,
    // Chance that a slot is null; `generate` rejects values outside 0..=1
    null_probability: f64,
}
297
298impl ArrayGenerator for NullGenerator {
299    fn generate(
300        &mut self,
301        length: RowCount,
302        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
303    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
304        let array = self.generator.generate(length, rng)?;
305        let data = array.to_data();
306
307        if self.null_probability < 0.0 || self.null_probability > 1.0 {
308            return Err(ArrowError::InvalidArgumentError(format!(
309                "null_probability must be between 0 and 1, got {}",
310                self.null_probability
311            )));
312        }
313
314        let (null_count, new_validity) = if self.null_probability == 0.0 {
315            if data.null_count() == 0 {
316                return Ok(array);
317            } else {
318                (0_usize, None)
319            }
320        } else if self.null_probability == 1.0 {
321            if data.null_count() == data.len() {
322                return Ok(array);
323            } else {
324                let all_nulls = BooleanBuffer::new_unset(array.len());
325                (array.len(), Some(all_nulls.into_inner()))
326            }
327        } else {
328            let array_len = array.len();
329            let num_validity_bytes = array_len.div_ceil(8);
330            let mut null_count = 0;
331            // Sampling the RNG once per bit is kind of slow so we do this to sample once
332            // per byte.  We only get 8 bits of RNG resolution but that should be good enough.
333            let threshold = (self.null_probability * u8::MAX as f64) as u8;
334            let bytes = (0..num_validity_bytes)
335                .map(|byte_idx| {
336                    let mut sample = rng.random::<u64>();
337                    let mut byte: u8 = 0;
338                    for bit_idx in 0..8 {
339                        // We could probably overshoot and fill in extra bits with random data but
340                        // this is cleaner and that would mess up the null count
341                        byte <<= 1;
342                        let pos = byte_idx * 8 + (7 - bit_idx);
343                        if pos < array_len {
344                            let sample_piece = sample & 0xFF;
345                            let is_null = (sample_piece as u8) < threshold;
346                            byte |= (!is_null) as u8;
347                            null_count += is_null as usize;
348                        }
349                        sample >>= 8;
350                    }
351                    byte
352                })
353                .collect::<Vec<_>>();
354            let new_validity = Buffer::from_iter(bytes);
355            (null_count, Some(new_validity))
356        };
357
358        unsafe {
359            let new_data = ArrayData::new_unchecked(
360                data.data_type().clone(),
361                data.len(),
362                Some(null_count),
363                new_validity,
364                data.offset(),
365                data.buffers().to_vec(),
366                data.child_data().into(),
367            );
368            Ok(make_array(new_data))
369        }
370    }
371
372    fn metadata(&self) -> Option<HashMap<String, String>> {
373        self.generator.metadata()
374    }
375
376    fn data_type(&self) -> &DataType {
377        self.generator.data_type()
378    }
379
380    fn element_size_bytes(&self) -> Option<ByteCount> {
381        self.generator.element_size_bytes()
382    }
383}
384
385pub trait ArrayGeneratorExt {
386    /// Replaces the validity bitmap of generated arrays, inserting nulls with a given probability
387    fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator>;
388    /// Replaces the validity bitmap of generated arrays with the inverse of `nulls`, cycling if needed
389    fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
390    /// Replaces the values of generated arrays with NaN values, cycling if needed
391    ///
392    /// Will have no effect if the data type is not a floating point data type
393    fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator>;
394    /// Replaces the validity bitmap of generated arrays with `validity`, cycling if needed
395    fn with_validity(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
396    fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator>;
397}
398
399impl ArrayGeneratorExt for Box<dyn ArrayGenerator> {
400    fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator> {
401        Box::new(NullGenerator {
402            generator: self,
403            null_probability,
404        })
405    }
406
407    fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator> {
408        Box::new(CycleNullGenerator {
409            generator: self,
410            validity: nulls.iter().map(|v| !*v).collect(),
411            idx: 0,
412        })
413    }
414
415    fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator> {
416        Box::new(CycleNanGenerator {
417            generator: self,
418            nan_pattern: nans.to_vec(),
419            idx: 0,
420        })
421    }
422
423    fn with_validity(self, validity: &[bool]) -> Box<dyn ArrayGenerator> {
424        Box::new(CycleNullGenerator {
425            generator: self,
426            validity: validity.to_vec(),
427            idx: 0,
428        })
429    }
430
431    fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator> {
432        Box::new(MetadataGenerator {
433            generator: self,
434            metadata,
435        })
436    }
437}
438
/// Iterator adapter that yields every item of the inner iterator `n` times in a row.
///
/// It can resume in the middle of a run: `cur` is first yielded `count` more times and
/// only then is the inner iterator consulted.
pub struct NTimesIter<I: Iterator>
where
    I::Item: Copy,
{
    // Source of the values to repeat
    iter: I,
    // How many times each value is yielded; 0 is treated the same as 1
    n: u32,
    // The value currently being repeated
    cur: I::Item,
    // How many more times `cur` must be yielded before pulling a new value
    count: u32,
}

// Note: if this is used then there is a performance hit as the
// inner loop cannot experience vectorization
//
// TODO: maybe faster to build the vec and then repeat it into
// the destination array?
impl<I: Iterator> Iterator for NTimesIter<I>
where
    I::Item: Copy,
{
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        if self.count == 0 {
            // Pull first and only then arm the repeat counter.  Doing it the other way
            // round would leave `count` non-zero once the inner iterator is exhausted and
            // later calls would keep yielding the stale `cur`.
            self.cur = self.iter.next()?;
            // saturating_sub: `n == 0` must not underflow.
            self.count = self.n.saturating_sub(1);
        } else {
            self.count -= 1;
        }
        Some(self.cur)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let per_item = self.n.max(1) as usize;
        // Repeats of `cur` that are still owed come on top of what the inner iterator has.
        let pending = self.count as usize;
        let (lower, upper) = self.iter.size_hint();
        let lower = lower.saturating_mul(per_item).saturating_add(pending);
        // An overflowing upper bound is reported as unknown.
        let upper = upper
            .and_then(|u| u.checked_mul(per_item))
            .and_then(|u| u.checked_add(pending));
        (lower, upper)
    }
}
477
/// Generator that builds arrays by calling a closure once per value.
///
/// `T` is the value type returned by the closure and `ArrayType` is the array type built
/// from the collected `Vec<T>`.
pub struct FnGen<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T>
where
    T: Copy + Default,
    ArrayType: arrow_array::Array + From<Vec<T>>,
{
    // Data type reported by `ArrayGenerator::data_type`
    data_type: DataType,
    // Closure that produces one value from the RNG
    generator: F,
    // Ties the struct to `ArrayType` without storing one
    array_type: PhantomData<ArrayType>,
    // How many times each produced value is emitted; values above 1 enable repetition
    repeat: u32,
    // Last value emitted by the previous `generate` call
    leftover: T,
    // Carried into the next `generate` call as the initial `NTimesIter::count`
    leftover_count: u32,
    // Size of one element in bytes, if known
    element_size_bytes: Option<ByteCount>,
}
491
492impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> std::fmt::Debug
493    for FnGen<T, ArrayType, F>
494where
495    T: Copy + Default,
496    ArrayType: arrow_array::Array + From<Vec<T>>,
497{
498    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
499        f.debug_struct("FnGen")
500            .field("data_type", &self.data_type)
501            .field("array_type", &self.array_type)
502            .field("repeat", &self.repeat)
503            .field("leftover_count", &self.leftover_count)
504            .field("element_size_bytes", &self.element_size_bytes)
505            .finish()
506    }
507}
508
509impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
510where
511    T: Copy + Default,
512    ArrayType: arrow_array::Array + From<Vec<T>>,
513{
514    fn new_known_size(
515        data_type: DataType,
516        generator: F,
517        repeat: u32,
518        element_size_bytes: ByteCount,
519    ) -> Self {
520        Self {
521            data_type,
522            generator,
523            array_type: PhantomData,
524            repeat,
525            leftover: T::default(),
526            leftover_count: 0,
527            element_size_bytes: Some(element_size_bytes),
528        }
529    }
530
531    fn new_unknown_size(data_type: DataType, generator: F, repeat: u32) -> Self {
532        Self {
533            data_type,
534            generator,
535            array_type: PhantomData,
536            repeat,
537            leftover: T::default(),
538            leftover_count: 0,
539            element_size_bytes: None,
540        }
541    }
542}
543
544impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> ArrayGenerator
545    for FnGen<T, ArrayType, F>
546where
547    T: Copy + Default + Send + Sync,
548    ArrayType: arrow_array::Array + From<Vec<T>> + 'static,
549    F: Send + Sync,
550{
551    fn generate(
552        &mut self,
553        length: RowCount,
554        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
555    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
556        let iter = (0..length.0).map(|_| (self.generator)(rng));
557        let values = if self.repeat > 1 {
558            Vec::from_iter(
559                NTimesIter {
560                    iter,
561                    n: self.repeat,
562                    cur: self.leftover,
563                    count: self.leftover_count,
564                }
565                .take(length.0 as usize),
566            )
567        } else {
568            Vec::from_iter(iter)
569        };
570        self.leftover_count = ((self.leftover_count as u64 + length.0) % self.repeat as u64) as u32;
571        self.leftover = values.last().copied().unwrap_or(T::default());
572        Ok(Arc::new(ArrayType::from(values)))
573    }
574
575    fn data_type(&self) -> &DataType {
576        &self.data_type
577    }
578
579    fn element_size_bytes(&self) -> Option<ByteCount> {
580        self.element_size_bytes
581    }
582}
583
/// Seed value for the random number generator handed to the generators.
#[derive(Copy, Clone, Debug)]
pub struct Seed(pub u64);
/// Seed used by [`ArrayGenerator::generate_default`].
pub const DEFAULT_SEED: Seed = Seed(42);
587
588impl From<u64> for Seed {
589    fn from(n: u64) -> Self {
590        Self(n)
591    }
592}
593
/// Generator of fixed-size-list arrays whose items come from another generator.
#[derive(Debug)]
pub struct CycleVectorGenerator {
    // Produces the flattened list items
    underlying_gen: Box<dyn ArrayGenerator>,
    // Number of items in every list
    dimension: Dimension,
    // The `FixedSizeList` type reported by `ArrayGenerator::data_type`
    data_type: DataType,
}
600
601impl CycleVectorGenerator {
602    pub fn new(underlying_gen: Box<dyn ArrayGenerator>, dimension: Dimension) -> Self {
603        let data_type = DataType::FixedSizeList(
604            Arc::new(Field::new("item", underlying_gen.data_type().clone(), true)),
605            dimension.0 as i32,
606        );
607        Self {
608            underlying_gen,
609            dimension,
610            data_type,
611        }
612    }
613}
614
615impl ArrayGenerator for CycleVectorGenerator {
616    fn generate(
617        &mut self,
618        length: RowCount,
619        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
620    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
621        let values = self
622            .underlying_gen
623            .generate(RowCount::from(length.0 * self.dimension.0 as u64), rng)?;
624        let field = Arc::new(Field::new("item", values.data_type().clone(), true));
625        let values = Arc::new(values);
626
627        let array = FixedSizeListArray::try_new(field, self.dimension.0 as i32, values, None)?;
628
629        Ok(Arc::new(array))
630    }
631
632    fn data_type(&self) -> &DataType {
633        &self.data_type
634    }
635
636    fn element_size_bytes(&self) -> Option<ByteCount> {
637        self.underlying_gen
638            .element_size_bytes()
639            .map(|byte_count| ByteCount::from(byte_count.0 * self.dimension.0 as u64))
640    }
641}
642
/// Generator of variable-length list arrays whose items come from another generator.
#[derive(Debug)]
pub struct CycleListGenerator {
    // Produces the flattened list items
    underlying_gen: Box<dyn ArrayGenerator>,
    // Produces one `u32` length per list
    lengths_gen: Box<dyn ArrayGenerator>,
    // The `List` type reported by `ArrayGenerator::data_type`
    data_type: DataType,
}
649
650impl CycleListGenerator {
651    pub fn new(
652        underlying_gen: Box<dyn ArrayGenerator>,
653        min_list_size: Dimension,
654        max_list_size: Dimension,
655    ) -> Self {
656        let data_type = DataType::List(Arc::new(Field::new(
657            "item",
658            underlying_gen.data_type().clone(),
659            true,
660        )));
661        let lengths_dist = Uniform::new(min_list_size.0, max_list_size.0).unwrap();
662        let lengths_gen = rand_with_distribution::<UInt32Type, Uniform<u32>>(lengths_dist);
663        Self {
664            underlying_gen,
665            lengths_gen,
666            data_type,
667        }
668    }
669}
670
671impl ArrayGenerator for CycleListGenerator {
672    fn generate(
673        &mut self,
674        length: RowCount,
675        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
676    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
677        let lengths = self.lengths_gen.generate(length, rng)?;
678        let lengths = lengths.as_primitive::<UInt32Type>();
679        let total_length = lengths.values().iter().map(|i| *i as u64).sum::<u64>();
680        let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
681        let values = self
682            .underlying_gen
683            .generate(RowCount::from(total_length), rng)?;
684        let field = Arc::new(Field::new("item", values.data_type().clone(), true));
685        let values = Arc::new(values);
686
687        let array = ListArray::try_new(field, offsets, values, None)?;
688
689        Ok(Arc::new(array))
690    }
691
692    fn data_type(&self) -> &DataType {
693        &self.data_type
694    }
695
696    fn element_size_bytes(&self) -> Option<ByteCount> {
697        None
698    }
699}
700
/// Generator of 16-byte `FixedSizeBinary` values filled with random bytes.
// NOTE(review): the bytes come straight from the RNG; no UUID version/variant bits are set.
#[derive(Debug, Default)]
pub struct PseudoUuidGenerator {}
703
704impl ArrayGenerator for PseudoUuidGenerator {
705    fn generate(
706        &mut self,
707        length: RowCount,
708        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
709    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
710        Ok(Arc::new(FixedSizeBinaryArray::try_from_iter(
711            (0..length.0).map(|_| {
712                let mut data = vec![0; 16];
713                rng.fill_bytes(&mut data);
714                data
715            }),
716        )?))
717    }
718
719    fn data_type(&self) -> &DataType {
720        &DataType::FixedSizeBinary(16)
721    }
722
723    fn element_size_bytes(&self) -> Option<ByteCount> {
724        Some(ByteCount::from(16))
725    }
726}
727
/// Generator of `Utf8` values, each the 32-character hex encoding of 16 random bytes.
#[derive(Debug, Default)]
pub struct PseudoUuidHexGenerator {}
730
731impl ArrayGenerator for PseudoUuidHexGenerator {
732    fn generate(
733        &mut self,
734        length: RowCount,
735        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
736    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
737        let mut data = vec![0; 16 * length.0 as usize];
738        rng.fill_bytes(&mut data);
739        let data_hex = hex::encode(data);
740
741        Ok(Arc::new(StringArray::from_iter_values(
742            (0..length.0 as usize).map(|i| data_hex.get(i * 32..(i + 1) * 32).unwrap()),
743        )))
744    }
745
746    fn data_type(&self) -> &DataType {
747        &DataType::Utf8
748    }
749
750    fn element_size_bytes(&self) -> Option<ByteCount> {
751        Some(ByteCount::from(16))
752    }
753}
754
/// Generator of `Boolean` arrays whose bits are filled from the RNG.
#[derive(Debug, Default)]
pub struct RandomBooleanGenerator {}
757
758impl ArrayGenerator for RandomBooleanGenerator {
759    fn generate(
760        &mut self,
761        length: RowCount,
762        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
763    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
764        let num_bytes = length.0.div_ceil(8);
765        let mut bytes = vec![0; num_bytes as usize];
766        rng.fill_bytes(&mut bytes);
767        let bytes = BooleanBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
768        Ok(Arc::new(arrow_array::BooleanArray::new(bytes, None)))
769    }
770
771    fn data_type(&self) -> &DataType {
772        &DataType::Boolean
773    }
774
775    fn element_size_bytes(&self) -> Option<ByteCount> {
776        // We can't say 1/8th of a byte and 1 byte would be a pretty extreme over-count so let's leave
777        // it at None until someone needs this.  Then we can probably special case this (e.g. make a ByteCount::ONE_BIT)
778        None
779    }
780}
781
/// Generator that fills a primitive array of type `T` with random bytes.
///
/// Instead of using the "standard distribution" and generating values there are some cases
/// (e.g. f16 / decimal) where we just generate random bytes because there is no rand support
pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
    // Ties the struct to `T` without storing one
    phantom: PhantomData<T>,
    // Data type applied to the generated array and reported by `ArrayGenerator::data_type`
    data_type: DataType,
}
788
789impl<T: ArrowPrimitiveType + Send + Sync> std::fmt::Debug for RandomBytesGenerator<T> {
790    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
791        f.debug_struct("RandomBytesGenerator")
792            .field("data_type", &self.data_type)
793            .finish()
794    }
795}
796
797impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
798    fn new(data_type: DataType) -> Self {
799        Self {
800            phantom: Default::default(),
801            data_type,
802        }
803    }
804
805    fn byte_width() -> Result<u64, ArrowError> {
806        T::DATA_TYPE.primitive_width().ok_or_else(|| ArrowError::InvalidArgumentError(format!("Cannot generate the data type {} with the RandomBytesGenerator because it is not a fixed-width bytes type", T::DATA_TYPE))).map(|val| val as u64)
807    }
808}
809
810impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerator<T> {
811    fn generate(
812        &mut self,
813        length: RowCount,
814        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
815    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
816        let num_bytes = length.0 * Self::byte_width()?;
817        let mut bytes = vec![0; num_bytes as usize];
818        rng.fill_bytes(&mut bytes);
819        let bytes = ScalarBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
820        Ok(Arc::new(
821            PrimitiveArray::<T>::new(bytes, None).with_data_type(self.data_type.clone()),
822        ))
823    }
824
825    fn data_type(&self) -> &DataType {
826        &self.data_type
827    }
828
829    fn element_size_bytes(&self) -> Option<ByteCount> {
830        Self::byte_width().map(ByteCount::from).ok()
831    }
832}
833
/// Generator of `FixedSizeBinary` arrays filled with random bytes.
///
/// This is pretty much the same thing as `RandomBytesGenerator` but we can't use that
/// because there is no ArrowPrimitiveType for FixedSizeBinary
// NOTE(review): the original comment named RandomBinaryGenerator; the generator in this file
// that is bound to ArrowPrimitiveType is RandomBytesGenerator, which is presumably what was
// meant - confirm.
#[derive(Debug)]
pub struct RandomFixedSizeBinaryGenerator {
    // The `FixedSizeBinary(size)` type reported by `ArrayGenerator::data_type`
    data_type: DataType,
    // Width of every value in bytes
    size: i32,
}
841
842impl RandomFixedSizeBinaryGenerator {
843    fn new(size: i32) -> Self {
844        Self {
845            size,
846            data_type: DataType::FixedSizeBinary(size),
847        }
848    }
849}
850
851impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
852    fn generate(
853        &mut self,
854        length: RowCount,
855        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
856    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
857        let num_bytes = length.0 * self.size as u64;
858        let mut bytes = vec![0; num_bytes as usize];
859        rng.fill_bytes(&mut bytes);
860        Ok(Arc::new(FixedSizeBinaryArray::new(
861            self.size,
862            Buffer::from(bytes),
863            None,
864        )))
865    }
866
867    fn data_type(&self) -> &DataType {
868        &self.data_type
869    }
870
871    fn element_size_bytes(&self) -> Option<ByteCount> {
872        Some(ByteCount::from(self.size as u64))
873    }
874}
875
/// Generator of `Interval` arrays with random components for a given interval unit.
#[derive(Debug)]
pub struct RandomIntervalGenerator {
    // Which interval representation to generate
    unit: IntervalUnit,
    // The `Interval(unit)` type reported by `ArrayGenerator::data_type`
    data_type: DataType,
}
881
882impl RandomIntervalGenerator {
883    pub fn new(unit: IntervalUnit) -> Self {
884        Self {
885            unit,
886            data_type: DataType::Interval(unit),
887        }
888    }
889}
890
891impl ArrayGenerator for RandomIntervalGenerator {
892    fn generate(
893        &mut self,
894        length: RowCount,
895        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
896    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
897        match self.unit {
898            IntervalUnit::YearMonth => {
899                let months = (0..length.0)
900                    .map(|_| rng.random::<i32>())
901                    .collect::<Vec<_>>();
902                Ok(Arc::new(arrow_array::IntervalYearMonthArray::from(months)))
903            }
904            IntervalUnit::MonthDayNano => {
905                let day_time_array = (0..length.0)
906                    .map(|_| IntervalMonthDayNano::new(rng.random(), rng.random(), rng.random()))
907                    .collect::<Vec<_>>();
908                Ok(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(
909                    day_time_array,
910                )))
911            }
912            IntervalUnit::DayTime => {
913                let day_time_array = (0..length.0)
914                    .map(|_| IntervalDayTime::new(rng.random(), rng.random()))
915                    .collect::<Vec<_>>();
916                Ok(Arc::new(arrow_array::IntervalDayTimeArray::from(
917                    day_time_array,
918                )))
919            }
920        }
921    }
922
923    fn data_type(&self) -> &DataType {
924        &self.data_type
925    }
926
927    fn element_size_bytes(&self) -> Option<ByteCount> {
928        Some(ByteCount::from(12))
929    }
930}
/// Generator of binary or string arrays in which every value has the same number of
/// random bytes.
#[derive(Debug)]
pub struct RandomBinaryGenerator {
    // Number of bytes in every value
    bytes_per_element: ByteCount,
    // When set, the random bytes are mapped into a printable range and a string array is built
    scale_to_utf8: bool,
    // When set, the "large" array variants are built
    is_large: bool,
    // Binary / LargeBinary / Utf8 / LargeUtf8, chosen from the two flags above
    data_type: DataType,
}
938
939impl RandomBinaryGenerator {
940    pub fn new(bytes_per_element: ByteCount, scale_to_utf8: bool, is_large: bool) -> Self {
941        Self {
942            bytes_per_element,
943            scale_to_utf8,
944            is_large,
945            data_type: match (scale_to_utf8, is_large) {
946                (false, false) => DataType::Binary,
947                (false, true) => DataType::LargeBinary,
948                (true, false) => DataType::Utf8,
949                (true, true) => DataType::LargeUtf8,
950            },
951        }
952    }
953}
954
955impl ArrayGenerator for RandomBinaryGenerator {
956    fn generate(
957        &mut self,
958        length: RowCount,
959        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
960    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
961        let mut bytes = vec![0; (self.bytes_per_element.0 * length.0) as usize];
962        rng.fill_bytes(&mut bytes);
963        if self.scale_to_utf8 {
964            // This doesn't give us the full UTF-8 range and it isn't statistically correct but
965            // it's fast and probably good enough for most cases
966            bytes = bytes.into_iter().map(|val| (val % 95) + 32).collect();
967        }
968        let bytes = Buffer::from(bytes);
969        if self.is_large {
970            let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
971                self.bytes_per_element.0 as usize,
972                length.0 as usize,
973            ));
974            if self.scale_to_utf8 {
975                // This is safe because we are only using printable characters
976                unsafe {
977                    Ok(Arc::new(arrow_array::LargeStringArray::new_unchecked(
978                        offsets, bytes, None,
979                    )))
980                }
981            } else {
982                unsafe {
983                    Ok(Arc::new(arrow_array::LargeBinaryArray::new_unchecked(
984                        offsets, bytes, None,
985                    )))
986                }
987            }
988        } else {
989            let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
990                self.bytes_per_element.0 as usize,
991                length.0 as usize,
992            ));
993            if self.scale_to_utf8 {
994                // This is safe because we are only using printable characters
995                unsafe {
996                    Ok(Arc::new(arrow_array::StringArray::new_unchecked(
997                        offsets, bytes, None,
998                    )))
999                }
1000            } else {
1001                unsafe {
1002                    Ok(Arc::new(arrow_array::BinaryArray::new_unchecked(
1003                        offsets, bytes, None,
1004                    )))
1005                }
1006            }
1007        }
1008    }
1009
1010    fn data_type(&self) -> &DataType {
1011        &self.data_type
1012    }
1013
1014    fn element_size_bytes(&self) -> Option<ByteCount> {
1015        // Not exactly correct since there are N + 1 4-byte offsets and this only counts N
1016        Some(ByteCount::from(
1017            self.bytes_per_element.0 + std::mem::size_of::<i32>() as u64,
1018        ))
1019    }
1020}
1021
/// Generate a sequence of strings with a prefix and a counter
///
/// For example, if the prefix is "user_" the strings will be "user_0", "user_1", ...
///
/// The counter keeps increasing across calls to `generate`, so consecutive batches
/// continue the sequence instead of restarting at zero.
#[derive(Debug)]
pub struct PrefixPlusCounterGenerator {
    /// Text placed in front of every counter value
    prefix: String,
    /// When true a `LargeUtf8` array is produced, otherwise a `Utf8` array
    is_large: bool,
    /// Arrow data type chosen from `is_large`
    data_type: DataType,
    /// Counter value that the next generated batch starts at
    current_counter: u64,
}
1032
1033impl PrefixPlusCounterGenerator {
1034    pub fn new(prefix: String, is_large: bool) -> Self {
1035        Self {
1036            prefix,
1037            is_large,
1038            data_type: if is_large {
1039                DataType::LargeUtf8
1040            } else {
1041                DataType::Utf8
1042            },
1043            current_counter: 0,
1044        }
1045    }
1046
1047    fn generate_values<T: OffsetSizeTrait>(
1048        &self,
1049        start: u64,
1050        num_values: u64,
1051    ) -> Result<Arc<dyn Array>, ArrowError> {
1052        let max_counter = start + num_values;
1053        let max_digits_per_counter = (max_counter as f64).log10().ceil() as u64;
1054        let max_bytes_per_str = max_digits_per_counter + self.prefix.len() as u64;
1055        let max_bytes = max_bytes_per_str * num_values;
1056        let mut builder =
1057            GenericStringBuilder::<T>::with_capacity(num_values as usize, max_bytes as usize);
1058        let mut word = String::with_capacity(max_bytes_per_str as usize);
1059        word.push_str(&self.prefix);
1060        for i in 0..num_values {
1061            let counter = start + i;
1062            word.truncate(self.prefix.len());
1063            word.push_str(&counter.to_string());
1064            builder.append_value(&word);
1065        }
1066        Ok(Arc::new(builder.finish()))
1067    }
1068}
1069
1070impl ArrayGenerator for PrefixPlusCounterGenerator {
1071    fn generate(
1072        &mut self,
1073        length: RowCount,
1074        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1075    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1076        let start = self.current_counter;
1077        self.current_counter += length.0;
1078        if self.is_large {
1079            self.generate_values::<i64>(start, length.0)
1080        } else {
1081            self.generate_values::<i32>(start, length.0)
1082        }
1083    }
1084
1085    fn data_type(&self) -> &DataType {
1086        &self.data_type
1087    }
1088
1089    fn element_size_bytes(&self) -> Option<ByteCount> {
1090        // It's not consistent
1091        None
1092    }
1093}
1094
/// Generate a sequence of binary strings with a prefix and a counter
///
/// The counter will be encoded (little-endian) as a u8, u16, u32, or u64 and appended to
/// the prefix. As long as more than 256 values are generated then the resulting array will
/// have variable width
///
/// The counter keeps increasing across calls to `generate`.
#[derive(Debug)]
pub struct BinaryPrefixPlusCounterGenerator {
    /// Bytes placed in front of every encoded counter
    prefix: Arc<[u8]>,
    /// When true a `LargeBinary` array is produced, otherwise a `Binary` array
    is_large: bool,
    /// Arrow data type chosen from `is_large`
    data_type: DataType,
    /// Counter value that the next generated batch starts at
    current_counter: u64,
}
1107
1108impl BinaryPrefixPlusCounterGenerator {
1109    pub fn new(prefix: Arc<[u8]>, is_large: bool) -> Self {
1110        Self {
1111            prefix,
1112            is_large,
1113            data_type: if is_large {
1114                DataType::LargeBinary
1115            } else {
1116                DataType::Binary
1117            },
1118            current_counter: 0,
1119        }
1120    }
1121
1122    fn generate_values<T: OffsetSizeTrait>(
1123        &self,
1124        start: u64,
1125        num_values: u64,
1126    ) -> Result<Arc<dyn Array>, ArrowError> {
1127        let max_bytes = (self.prefix.len() + std::mem::size_of::<u64>()) * num_values as usize;
1128        let mut builder = GenericBinaryBuilder::<T>::with_capacity(num_values as usize, max_bytes);
1129        let mut word = Vec::with_capacity(self.prefix.len() + std::mem::size_of::<u64>());
1130        word.extend_from_slice(&self.prefix);
1131        for i in 0..num_values {
1132            let counter = start + i;
1133            word.truncate(self.prefix.len());
1134            if counter < u8::MAX as u64 {
1135                word.push(counter as u8);
1136            } else if counter < u16::MAX as u64 {
1137                word.extend_from_slice(&(counter as u16).to_le_bytes());
1138            } else if counter < u32::MAX as u64 {
1139                word.extend_from_slice(&(counter as u32).to_le_bytes());
1140            } else {
1141                word.extend_from_slice(&counter.to_le_bytes());
1142            }
1143            builder.append_value(&word);
1144        }
1145        Ok(Arc::new(builder.finish()))
1146    }
1147}
1148
1149impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
1150    fn generate(
1151        &mut self,
1152        length: RowCount,
1153        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1154    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1155        let start = self.current_counter;
1156        self.current_counter += length.0;
1157        if self.is_large {
1158            self.generate_values::<i64>(start, length.0)
1159        } else {
1160            self.generate_values::<i32>(start, length.0)
1161        }
1162    }
1163
1164    fn data_type(&self) -> &DataType {
1165        &self.data_type
1166    }
1167
1168    fn element_size_bytes(&self) -> Option<ByteCount> {
1169        // It's not consistent
1170        None
1171    }
1172}
1173
/// Common English stop words.
///
/// These are placed at the front of `SENTENCE_WORDS` so that they are sampled more
/// frequently.
const STOP_WORDS: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
];
1180
/// Vocabulary of lowercase English words used by the word and sentence generators.
///
/// `RandomWordGenerator` samples from this list directly; `SENTENCE_WORDS` appends it
/// after `STOP_WORDS`.
const ENGLISH_WORDS: &[&str] = &[
    "ability",
    "able",
    "about",
    "above",
    "accept",
    "access",
    "account",
    "across",
    "action",
    "active",
    "activity",
    "actual",
    "address",
    "adjust",
    "admin",
    "advance",
    "agent",
    "align",
    "allow",
    "amount",
    "analysis",
    "answer",
    "application",
    "archive",
    "array",
    "asset",
    "async",
    "attribute",
    "available",
    "balance",
    "batch",
    "binary",
    "bitmap",
    "block",
    "branch",
    "buffer",
    "build",
    "cache",
    "capacity",
    "catalog",
    "change",
    "chunk",
    "client",
    "cluster",
    "column",
    "commit",
    "common",
    "compare",
    "compile",
    "compute",
    "condition",
    "config",
    "connect",
    "content",
    "context",
    "control",
    "convert",
    "copy",
    "core",
    "count",
    "create",
    "current",
    "cursor",
    "data",
    "dataset",
    "decode",
    "default",
    "delete",
    "delta",
    "depend",
    "derive",
    "design",
    "detail",
    "detect",
    "device",
    "direct",
    "display",
    "document",
    "domain",
    "drive",
    "dynamic",
    "encode",
    "engine",
    "error",
    "event",
    "example",
    "execute",
    "expand",
    "expect",
    "export",
    "extend",
    "feature",
    "field",
    "filter",
    "final",
    "finish",
    "format",
    "fragment",
    "future",
    "generate",
    "global",
    "group",
    "handle",
    "header",
    "index",
    "input",
    "insert",
    "inspect",
    "instance",
    "integer",
    "internal",
    "item",
    "join",
    "kernel",
    "large",
    "layer",
    "layout",
    "length",
    "level",
    "limit",
    "linear",
    "local",
    "logical",
    "lookup",
    "manage",
    "manifest",
    "memory",
    "merge",
    "metric",
    "model",
    "module",
    "namespace",
    "native",
    "node",
    "normal",
    "number",
    "object",
    "offset",
    "option",
    "output",
    "package",
    "page",
    "parallel",
    "parse",
    "partition",
    "pattern",
    "physical",
    "plan",
    "policy",
    "prefix",
    "prepare",
    "primary",
    "process",
    "profile",
    "project",
    "property",
    "query",
    "range",
    "reader",
    "record",
    "region",
    "registry",
    "request",
    "resolve",
    "resource",
    "result",
    "return",
    "row",
    "runtime",
    "scalar",
    "scan",
    "schema",
    "search",
    "segment",
    "select",
    "session",
    "setting",
    "source",
    "stable",
    "stage",
    "state",
    "static",
    "storage",
    "stream",
    "string",
    "struct",
    "table",
    "target",
    "task",
    "thread",
    "token",
    "trace",
    "transform",
    "type",
    "update",
    "upload",
    "value",
    "vector",
    "version",
    "view",
    "write",
    "writer",
];
1385
1386/// Word list with stop words at the front for Zipf sampling, computed once.
1387static SENTENCE_WORDS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
1388    let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len());
1389    words.extend(STOP_WORDS.iter().copied());
1390    words.extend(ENGLISH_WORDS.iter().copied());
1391    words
1392});
1393
/// Generates sentences made of space-separated words drawn from `SENTENCE_WORDS`.
struct RandomSentenceGenerator {
    /// Smallest number of words in a sentence (inclusive)
    min_words: usize,
    /// Largest number of words in a sentence (inclusive)
    max_words: usize,
    /// Zipf distribution for word selection (favors lower indices)
    zipf: Zipf<f64>,
    /// When true a `LargeUtf8` array is produced, otherwise a `Utf8` array
    is_large: bool,
}
1401
1402impl std::fmt::Debug for RandomSentenceGenerator {
1403    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1404        f.debug_struct("RandomSentenceGenerator")
1405            .field("min_words", &self.min_words)
1406            .field("max_words", &self.max_words)
1407            .field("num_words", &SENTENCE_WORDS.len())
1408            .field("is_large", &self.is_large)
1409            .finish()
1410    }
1411}
1412
1413impl RandomSentenceGenerator {
1414    pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self {
1415        // Zipf distribution with exponent ~1.0 approximates natural language
1416        let zipf = Zipf::new(SENTENCE_WORDS.len() as f64, 1.0).unwrap();
1417
1418        Self {
1419            min_words,
1420            max_words,
1421            zipf,
1422            is_large,
1423        }
1424    }
1425}
1426
1427impl ArrayGenerator for RandomSentenceGenerator {
1428    fn generate(
1429        &mut self,
1430        length: RowCount,
1431        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1432    ) -> Result<Arc<dyn Array>, ArrowError> {
1433        let mut values = Vec::with_capacity(length.0 as usize);
1434
1435        for _ in 0..length.0 {
1436            let num_words = rng.random_range(self.min_words..=self.max_words);
1437            let sentence: String = (0..num_words)
1438                .map(|_| {
1439                    // Zipf returns 1-indexed values, subtract 1 for 0-indexed array
1440                    let idx = rng.sample(self.zipf) as usize - 1;
1441                    SENTENCE_WORDS[idx]
1442                })
1443                .collect::<Vec<_>>()
1444                .join(" ");
1445            values.push(sentence);
1446        }
1447
1448        if self.is_large {
1449            Ok(Arc::new(LargeStringArray::from(values)))
1450        } else {
1451            Ok(Arc::new(StringArray::from(values)))
1452        }
1453    }
1454
1455    fn data_type(&self) -> &DataType {
1456        if self.is_large {
1457            &DataType::LargeUtf8
1458        } else {
1459            &DataType::Utf8
1460        }
1461    }
1462
1463    fn element_size_bytes(&self) -> Option<ByteCount> {
1464        // Estimate average word length as 5, plus space
1465        // See https://arxiv.org/pdf/1208.6109
1466        let avg_word_length = 6;
1467        let avg_words = (self.min_words + self.max_words) / 2;
1468        Some(ByteCount::from((avg_word_length * avg_words) as u64))
1469    }
1470}
1471
/// Generates single words chosen uniformly at random from a fixed word list.
#[derive(Debug)]
struct RandomWordGenerator {
    /// Word list that values are sampled from
    words: &'static [&'static str],
    /// When true a `LargeUtf8` array is produced, otherwise a `Utf8` array
    is_large: bool,
}
1477
1478impl RandomWordGenerator {
1479    pub fn new(is_large: bool) -> Self {
1480        let words = ENGLISH_WORDS;
1481        Self { words, is_large }
1482    }
1483}
1484
1485impl ArrayGenerator for RandomWordGenerator {
1486    fn generate(
1487        &mut self,
1488        length: RowCount,
1489        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1490    ) -> Result<Arc<dyn Array>, ArrowError> {
1491        let mut values = Vec::with_capacity(length.0 as usize);
1492
1493        for _ in 0..length.0 {
1494            let word = self.words[rng.random_range(0..self.words.len())];
1495            values.push(word.to_string());
1496        }
1497
1498        if self.is_large {
1499            Ok(Arc::new(LargeStringArray::from(values)))
1500        } else {
1501            Ok(Arc::new(StringArray::from(values)))
1502        }
1503    }
1504
1505    fn data_type(&self) -> &DataType {
1506        if self.is_large {
1507            &DataType::LargeUtf8
1508        } else {
1509            &DataType::Utf8
1510        }
1511    }
1512
1513    fn element_size_bytes(&self) -> Option<ByteCount> {
1514        // Average English word length is ~5 characters
1515        Some(ByteCount::from(5))
1516    }
1517}
1518
/// Generates random binary values whose lengths vary from element to element.
#[derive(Debug)]
pub struct VariableRandomBinaryGenerator {
    /// Produces the `Int32` length of each element
    lengths_gen: Box<dyn ArrayGenerator>,
    /// Always `DataType::Binary`
    data_type: DataType,
}
1524
1525impl VariableRandomBinaryGenerator {
1526    pub fn new(min_bytes_per_element: ByteCount, max_bytes_per_element: ByteCount) -> Self {
1527        let lengths_dist = Uniform::new_inclusive(
1528            min_bytes_per_element.0 as i32,
1529            max_bytes_per_element.0 as i32,
1530        )
1531        .unwrap();
1532        let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1533
1534        Self {
1535            lengths_gen,
1536            data_type: DataType::Binary,
1537        }
1538    }
1539}
1540
1541impl ArrayGenerator for VariableRandomBinaryGenerator {
1542    fn generate(
1543        &mut self,
1544        length: RowCount,
1545        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1546    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1547        let lengths = self.lengths_gen.generate(length, rng)?;
1548        let lengths = lengths.as_primitive::<Int32Type>();
1549        let total_length = lengths.values().iter().map(|i| *i as usize).sum::<usize>();
1550        let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1551        let mut bytes = vec![0; total_length];
1552        rng.fill_bytes(&mut bytes);
1553        let bytes = Buffer::from(bytes);
1554        Ok(Arc::new(BinaryArray::try_new(offsets, bytes, None)?))
1555    }
1556
1557    fn data_type(&self) -> &DataType {
1558        &self.data_type
1559    }
1560
1561    fn element_size_bytes(&self) -> Option<ByteCount> {
1562        None
1563    }
1564}
1565
/// Generates byte-array values by repeating a fixed list of values in order, wrapping
/// around to the first value after the last one.
pub struct CycleBinaryGenerator<T: ByteArrayType> {
    /// Bytes of all source values concatenated in order
    values: Vec<u8>,
    /// Byte length of each source value
    lengths: Vec<usize>,
    /// Arrow data type taken from `T::DATA_TYPE`
    data_type: DataType,
    /// Marker for the byte-array type being produced
    array_type: PhantomData<T>,
    /// Estimated element size, present only when all source values have the same length
    width: Option<ByteCount>,
    /// Index of the source value that the next generated batch starts at
    idx: usize,
}
1574
1575impl<T: ByteArrayType> std::fmt::Debug for CycleBinaryGenerator<T> {
1576    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1577        f.debug_struct("CycleBinaryGenerator")
1578            .field("values", &self.values)
1579            .field("lengths", &self.lengths)
1580            .field("data_type", &self.data_type)
1581            .field("width", &self.width)
1582            .field("idx", &self.idx)
1583            .finish()
1584    }
1585}
1586
1587impl<T: ByteArrayType> CycleBinaryGenerator<T> {
1588    pub fn from_strings(values: &[&str]) -> Self {
1589        if values.is_empty() {
1590            panic!("Attempt to create a cycle generator with no values");
1591        }
1592        let lengths = values.iter().map(|s| s.len()).collect::<Vec<_>>();
1593        let typical_length = lengths[0];
1594        let width = if lengths.iter().all(|item| *item == typical_length) {
1595            Some(ByteCount::from(
1596                typical_length as u64 + std::mem::size_of::<i32>() as u64,
1597            ))
1598        } else {
1599            None
1600        };
1601        let values = values
1602            .iter()
1603            .flat_map(|s| s.as_bytes().iter().copied())
1604            .collect::<Vec<_>>();
1605        Self {
1606            values,
1607            lengths,
1608            data_type: T::DATA_TYPE,
1609            array_type: PhantomData,
1610            width,
1611            idx: 0,
1612        }
1613    }
1614}
1615
1616impl<T: ByteArrayType> ArrayGenerator for CycleBinaryGenerator<T> {
1617    fn generate(
1618        &mut self,
1619        length: RowCount,
1620        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1621    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1622        let lengths = self
1623            .lengths
1624            .iter()
1625            .copied()
1626            .cycle()
1627            .skip(self.idx)
1628            .take(length.0 as usize);
1629        let num_bytes = lengths.clone().sum();
1630        let byte_offset = self.lengths[0..self.idx].iter().sum();
1631        let bytes = self
1632            .values
1633            .iter()
1634            .cycle()
1635            .skip(byte_offset)
1636            .copied()
1637            .take(num_bytes)
1638            .collect::<Vec<_>>();
1639        let bytes = Buffer::from(bytes);
1640        let offsets = OffsetBuffer::from_lengths(lengths);
1641        self.idx = (self.idx + length.0 as usize) % self.lengths.len();
1642        Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1643            offsets, bytes, None,
1644        )))
1645    }
1646
1647    fn data_type(&self) -> &DataType {
1648        &self.data_type
1649    }
1650
1651    fn element_size_bytes(&self) -> Option<ByteCount> {
1652        self.width
1653    }
1654}
1655
/// Generates byte-array values where every element is the same fixed value.
pub struct FixedBinaryGenerator<T: ByteArrayType> {
    /// Bytes of the value repeated for every element
    value: Vec<u8>,
    /// Arrow data type taken from `T::DATA_TYPE`
    data_type: DataType,
    /// Marker for the byte-array type being produced
    array_type: PhantomData<T>,
}
1661
1662impl<T: ByteArrayType> std::fmt::Debug for FixedBinaryGenerator<T> {
1663    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1664        f.debug_struct("FixedBinaryGenerator")
1665            .field("value", &self.value)
1666            .field("data_type", &self.data_type)
1667            .finish()
1668    }
1669}
1670
1671impl<T: ByteArrayType> FixedBinaryGenerator<T> {
1672    pub fn new(value: Vec<u8>) -> Self {
1673        Self {
1674            value,
1675            data_type: T::DATA_TYPE,
1676            array_type: PhantomData,
1677        }
1678    }
1679}
1680
1681impl<T: ByteArrayType> ArrayGenerator for FixedBinaryGenerator<T> {
1682    fn generate(
1683        &mut self,
1684        length: RowCount,
1685        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1686    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1687        let bytes = Buffer::from(Vec::from_iter(
1688            self.value
1689                .iter()
1690                .cycle()
1691                .take((length.0 * self.value.len() as u64) as usize)
1692                .copied(),
1693        ));
1694        let offsets =
1695            OffsetBuffer::from_lengths(iter::repeat_n(self.value.len(), length.0 as usize));
1696        Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1697            offsets, bytes, None,
1698        )))
1699    }
1700
1701    fn data_type(&self) -> &DataType {
1702        &self.data_type
1703    }
1704
1705    fn element_size_bytes(&self) -> Option<ByteCount> {
1706        // Not exactly correct since there are N + 1 4-byte offsets and this only counts N
1707        Some(ByteCount::from(
1708            self.value.len() as u64 + std::mem::size_of::<i32>() as u64,
1709        ))
1710    }
1711}
1712
/// Wraps another generator and casts its output to a dictionary array keyed by `K`.
pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
    /// Generator producing the values before dictionary encoding
    generator: Box<dyn ArrayGenerator>,
    /// `DataType::Dictionary` of `K`'s data type and the wrapped generator's data type
    data_type: DataType,
    /// Marker for the dictionary key type
    key_type: PhantomData<K>,
    /// Width in bytes of one dictionary key
    key_width: u64,
}
1719
1720impl<K: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryGenerator<K> {
1721    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1722        f.debug_struct("DictionaryGenerator")
1723            .field("generator", &self.generator)
1724            .field("data_type", &self.data_type)
1725            .field("key_width", &self.key_width)
1726            .finish()
1727    }
1728}
1729
1730impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
1731    fn new(generator: Box<dyn ArrayGenerator>) -> Self {
1732        let key_type = Box::new(K::DATA_TYPE);
1733        let key_width = key_type
1734            .primitive_width()
1735            .expect("dictionary key types should have a known width")
1736            as u64;
1737        let val_type = Box::new(generator.data_type().clone());
1738        let dict_type = DataType::Dictionary(key_type, val_type);
1739        Self {
1740            generator,
1741            data_type: dict_type,
1742            key_type: PhantomData,
1743            key_width,
1744        }
1745    }
1746}
1747
1748impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGenerator<K> {
1749    fn generate(
1750        &mut self,
1751        length: RowCount,
1752        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1753    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1754        let underlying = self.generator.generate(length, rng)?;
1755        arrow_cast::cast::cast(&underlying, &self.data_type)
1756    }
1757
1758    fn data_type(&self) -> &DataType {
1759        &self.data_type
1760    }
1761
1762    fn element_size_bytes(&self) -> Option<ByteCount> {
1763        self.generator
1764            .element_size_bytes()
1765            .map(|size_bytes| ByteCount::from(size_bytes.0 + self.key_width))
1766    }
1767}
1768
/// Generator that produces low-cardinality data by generating a fixed set of
/// unique values and then randomly selecting from them.
struct LowCardinalityGenerator {
    /// Generator used once to produce the pool of unique values
    inner: Box<dyn ArrayGenerator>,
    /// Number of values in the pool
    cardinality: usize,
    /// Cached unique values, generated on first call
    unique_values: Option<Arc<dyn Array>>,
}
1777
1778impl std::fmt::Debug for LowCardinalityGenerator {
1779    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1780        f.debug_struct("LowCardinalityGenerator")
1781            .field("inner", &self.inner)
1782            .field("cardinality", &self.cardinality)
1783            .field("initialized", &self.unique_values.is_some())
1784            .finish()
1785    }
1786}
1787
1788impl LowCardinalityGenerator {
1789    fn new(inner: Box<dyn ArrayGenerator>, cardinality: usize) -> Self {
1790        Self {
1791            inner,
1792            cardinality,
1793            unique_values: None,
1794        }
1795    }
1796}
1797
1798impl ArrayGenerator for LowCardinalityGenerator {
1799    fn generate(
1800        &mut self,
1801        length: RowCount,
1802        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1803    ) -> Result<Arc<dyn Array>, ArrowError> {
1804        // Generate unique values on first call
1805        if self.unique_values.is_none() {
1806            self.unique_values = Some(
1807                self.inner
1808                    .generate(RowCount::from(self.cardinality as u64), rng)?,
1809            );
1810        }
1811
1812        let unique_values = self.unique_values.as_ref().unwrap();
1813
1814        // Generate random indices into the unique values
1815        let indices: Vec<usize> = (0..length.0)
1816            .map(|_| rng.random_range(0..self.cardinality))
1817            .collect();
1818
1819        // Use arrow's take to select values
1820        let indices_array =
1821            arrow_array::UInt32Array::from(indices.iter().map(|&i| i as u32).collect::<Vec<_>>());
1822        arrow::compute::take(unique_values.as_ref(), &indices_array, None)
1823            .map(|arr| arr as Arc<dyn Array>)
1824    }
1825
1826    fn data_type(&self) -> &DataType {
1827        self.inner.data_type()
1828    }
1829
1830    fn element_size_bytes(&self) -> Option<ByteCount> {
1831        self.inner.element_size_bytes()
1832    }
1833}
1834
/// Generates list arrays whose items come from another generator and whose list lengths
/// are random.
#[derive(Debug)]
struct RandomListGenerator {
    /// Field carrying the list data type reported by `data_type`
    field: Arc<Field>,
    /// Field describing the list items
    child_field: Arc<Field>,
    /// Generator producing the list items
    items_gen: Box<dyn ArrayGenerator>,
    /// Generator producing the length of each list
    lengths_gen: Box<dyn ArrayGenerator>,
    /// When true a `LargeList` array is produced, otherwise a `List` array
    is_large: bool,
}
1843
1844impl RandomListGenerator {
1845    // Creates a list generator that generates random lists with lengths between 0 and 10 (inclusive)
1846    fn new(items_gen: Box<dyn ArrayGenerator>, is_large: bool) -> Self {
1847        let child_field = Arc::new(Field::new("item", items_gen.data_type().clone(), true));
1848        let list_type = if is_large {
1849            DataType::LargeList(child_field.clone())
1850        } else {
1851            DataType::List(child_field.clone())
1852        };
1853        let field = Field::new("", list_type, true);
1854        let lengths_gen = if is_large {
1855            let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1856            rand_with_distribution::<Int64Type, Uniform<i64>>(lengths_dist)
1857        } else {
1858            let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1859            rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist)
1860        };
1861        Self {
1862            field: Arc::new(field),
1863            child_field,
1864            items_gen,
1865            lengths_gen,
1866            is_large,
1867        }
1868    }
1869}
1870
1871impl ArrayGenerator for RandomListGenerator {
1872    fn generate(
1873        &mut self,
1874        length: RowCount,
1875        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1876    ) -> Result<Arc<dyn Array>, ArrowError> {
1877        let lengths = self.lengths_gen.generate(length, rng)?;
1878        if self.is_large {
1879            let lengths = lengths.as_primitive::<Int64Type>();
1880            let total_length = lengths.values().iter().sum::<i64>() as u64;
1881            let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1882            let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1883            Ok(Arc::new(LargeListArray::try_new(
1884                self.child_field.clone(),
1885                offsets,
1886                items,
1887                None,
1888            )?))
1889        } else {
1890            let lengths = lengths.as_primitive::<Int32Type>();
1891            let total_length = lengths.values().iter().sum::<i32>() as u64;
1892            let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1893            let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1894            Ok(Arc::new(ListArray::try_new(
1895                self.child_field.clone(),
1896                offsets,
1897                items,
1898                None,
1899            )?))
1900        }
1901    }
1902
1903    fn data_type(&self) -> &DataType {
1904        self.field.data_type()
1905    }
1906
1907    fn element_size_bytes(&self) -> Option<ByteCount> {
1908        None
1909    }
1910}
1911
/// Generates random map arrays where each map has 0-4 entries.
#[derive(Debug)]
struct RandomMapGenerator {
    /// Field carrying the map data type reported by `data_type`
    field: Arc<Field>,
    /// Field describing the struct of keys and values held by each map entry
    entries_field: Arc<Field>,
    /// Generator producing the map keys
    keys_gen: Box<dyn ArrayGenerator>,
    /// Generator producing the map values
    values_gen: Box<dyn ArrayGenerator>,
    /// Generator producing the number of entries in each map
    lengths_gen: Box<dyn ArrayGenerator>,
}
1921
1922impl RandomMapGenerator {
1923    fn new(keys_gen: Box<dyn ArrayGenerator>, values_gen: Box<dyn ArrayGenerator>) -> Self {
1924        let entries_fields = Fields::from(vec![
1925            Field::new("keys", keys_gen.data_type().clone(), false),
1926            Field::new("values", values_gen.data_type().clone(), true),
1927        ]);
1928        let entries_field = Arc::new(Field::new(
1929            "entries",
1930            DataType::Struct(entries_fields),
1931            false,
1932        ));
1933        let map_type = DataType::Map(entries_field.clone(), false);
1934        let field = Arc::new(Field::new("", map_type, true));
1935        let lengths_dist = Uniform::new_inclusive(0_i32, 4).unwrap();
1936        let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1937
1938        Self {
1939            field,
1940            entries_field,
1941            keys_gen,
1942            values_gen,
1943            lengths_gen,
1944        }
1945    }
1946}
1947
1948impl ArrayGenerator for RandomMapGenerator {
1949    fn generate(
1950        &mut self,
1951        length: RowCount,
1952        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1953    ) -> Result<Arc<dyn Array>, ArrowError> {
1954        let lengths = self.lengths_gen.generate(length, rng)?;
1955        let lengths = lengths.as_primitive::<Int32Type>();
1956        let total_entries = lengths.values().iter().sum::<i32>() as u64;
1957        let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1958
1959        let keys = self.keys_gen.generate(RowCount::from(total_entries), rng)?;
1960        let values = self
1961            .values_gen
1962            .generate(RowCount::from(total_entries), rng)?;
1963
1964        let entries = StructArray::new(
1965            Fields::from(vec![
1966                Field::new("keys", keys.data_type().clone(), false),
1967                Field::new("values", values.data_type().clone(), true),
1968            ]),
1969            vec![keys, values],
1970            None,
1971        );
1972
1973        Ok(Arc::new(MapArray::try_new(
1974            self.entries_field.clone(),
1975            offsets,
1976            entries,
1977            None,
1978            false,
1979        )?))
1980    }
1981
1982    fn data_type(&self) -> &DataType {
1983        self.field.data_type()
1984    }
1985
1986    fn element_size_bytes(&self) -> Option<ByteCount> {
1987        None
1988    }
1989}
1990
/// Generates `NullArray`s of the requested length.
#[derive(Debug)]
struct NullArrayGenerator {}
1993
1994impl ArrayGenerator for NullArrayGenerator {
1995    fn generate(
1996        &mut self,
1997        length: RowCount,
1998        _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1999    ) -> Result<Arc<dyn Array>, ArrowError> {
2000        Ok(Arc::new(NullArray::new(length.0 as usize)))
2001    }
2002
2003    fn data_type(&self) -> &DataType {
2004        &DataType::Null
2005    }
2006
2007    fn element_size_bytes(&self) -> Option<ByteCount> {
2008        None
2009    }
2010}
2011
/// Generates 2 dimensional vectors along the unit circle, with a configurable number of steps per circle.
///
/// The position on the circle is kept between calls to `generate`, so consecutive batches
/// continue where the previous one stopped.
#[derive(Debug)]
struct RadialStepGenerator {
    /// Number of equally spaced steps that make up one full circle
    num_steps_per_circle: u32,
    /// The Float32 "item" field of the generated fixed size lists
    data_field: Arc<Field>,
    /// The generated type: a fixed size list of 2 Float32 values
    data_type: DataType,
    /// Index of the next step to emit; wraps at `num_steps_per_circle`
    current_step: u32,
}
2020
2021impl RadialStepGenerator {
2022    fn new(num_steps_per_circle: u32) -> Self {
2023        let data_field = Arc::new(Field::new("item", DataType::Float32, false));
2024        let data_type = DataType::FixedSizeList(data_field.clone(), 2);
2025        Self {
2026            num_steps_per_circle,
2027            data_field,
2028            data_type,
2029            current_step: 0,
2030        }
2031    }
2032}
2033
2034impl ArrayGenerator for RadialStepGenerator {
2035    fn generate(
2036        &mut self,
2037        length: RowCount,
2038        _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2039    ) -> Result<Arc<dyn Array>, ArrowError> {
2040        let mut values_builder = Float32Builder::with_capacity(length.0 as usize * 2);
2041        for _ in 0..length.0 {
2042            let angle = (self.current_step as f32) / (self.num_steps_per_circle as f32)
2043                * 2.0
2044                * std::f32::consts::PI;
2045            values_builder.append_value(angle.cos());
2046            values_builder.append_value(angle.sin());
2047            self.current_step = (self.current_step + 1) % self.num_steps_per_circle;
2048        }
2049        let values = values_builder.finish();
2050        let vectors =
2051            FixedSizeListArray::try_new(self.data_field.clone(), 2, Arc::new(values), None)?;
2052        Ok(Arc::new(vectors))
2053    }
2054
2055    fn data_type(&self) -> &DataType {
2056        &self.data_type
2057    }
2058
2059    fn element_size_bytes(&self) -> Option<ByteCount> {
2060        Some(ByteCount::from(8))
2061    }
2062}
2063
/// Cycles through a set of centroids, adding noise to each point
#[derive(Debug)]
struct JitterCentroidsGenerator {
    /// Flattened Float32 values of the centroids (`dimension` values per centroid)
    centroids: Float32Array,
    /// Number of values in each centroid / generated vector
    dimension: u32,
    /// Length that each generated noise vector is scaled to
    noise_level: f32,
    /// The generated type: a fixed size list with the same item field and size as the centroids
    data_type: DataType,
    /// Item field of the generated fixed size lists
    data_field: Arc<Field>,

    /// Index into `centroids` of the first value of the next centroid to use
    offset: usize,
}
2075
2076impl JitterCentroidsGenerator {
2077    fn try_new(centroids: Arc<dyn Array>, noise_level: f32) -> Result<Self, ArrowError> {
2078        let DataType::FixedSizeList(values_field, dimension) = centroids.data_type() else {
2079            return Err(ArrowError::InvalidArgumentError(
2080                "Centroids must be a FixedSizeList".to_string(),
2081            ));
2082        };
2083        if values_field.data_type() != &DataType::Float32 {
2084            return Err(ArrowError::InvalidArgumentError(
2085                "Centroids values must be a Float32".to_string(),
2086            ));
2087        }
2088        let data_type = DataType::FixedSizeList(values_field.clone(), *dimension);
2089        Ok(Self {
2090            centroids: centroids
2091                .as_fixed_size_list()
2092                .values()
2093                .as_primitive::<Float32Type>()
2094                .clone(),
2095            dimension: *dimension as u32,
2096            noise_level,
2097            data_type,
2098            data_field: values_field.clone(),
2099            offset: 0,
2100        })
2101    }
2102}
2103
2104impl ArrayGenerator for JitterCentroidsGenerator {
2105    fn generate(
2106        &mut self,
2107        length: RowCount,
2108        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2109    ) -> Result<Arc<dyn Array>, ArrowError> {
2110        let mut values_builder =
2111            Float32Builder::with_capacity(length.0 as usize * self.dimension as usize);
2112        for _ in 0..length.0 {
2113            // Generate random N dimensional point
2114            let mut noise = (0..self.dimension as usize)
2115                .map(|_| rng.random::<f32>())
2116                .collect::<Vec<_>>();
2117            // Scale point to noise_level length
2118            let scale = self.noise_level / noise.iter().map(|v| v * v).sum::<f32>().sqrt();
2119            noise.iter_mut().for_each(|v| *v *= scale);
2120
2121            // Add noise to centroid and store in values
2122            for (i, noise) in noise.into_iter().enumerate() {
2123                let centroid_val = self.centroids.value(self.offset + i);
2124                let jittered_val = centroid_val + noise;
2125                values_builder.append_value(jittered_val);
2126            }
2127            // Advance to next centroid
2128            self.offset = (self.offset + self.dimension as usize) % self.centroids.len();
2129        }
2130        let values = values_builder.finish();
2131        let vectors = FixedSizeListArray::try_new(
2132            self.data_field.clone(),
2133            self.dimension as i32,
2134            Arc::new(values),
2135            None,
2136        )?;
2137        Ok(Arc::new(vectors))
2138    }
2139
2140    fn data_type(&self) -> &DataType {
2141        &self.data_type
2142    }
2143
2144    fn element_size_bytes(&self) -> Option<ByteCount> {
2145        Some(ByteCount::from(self.dimension as u64 * 4))
2146    }
2147}
/// Generates struct arrays by generating each child column with its own generator.
#[derive(Debug)]
struct RandomStructGenerator {
    /// Fields of the generated struct
    fields: Fields,
    /// `DataType::Struct` built from `fields`
    data_type: DataType,
    /// One generator per child column
    child_gens: Vec<Box<dyn ArrayGenerator>>,
}
2154
2155impl RandomStructGenerator {
2156    fn new(fields: Fields, child_gens: Vec<Box<dyn ArrayGenerator>>) -> Self {
2157        let data_type = DataType::Struct(fields.clone());
2158        Self {
2159            fields,
2160            data_type,
2161            child_gens,
2162        }
2163    }
2164}
2165
2166impl ArrayGenerator for RandomStructGenerator {
2167    fn generate(
2168        &mut self,
2169        length: RowCount,
2170        rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2171    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2172        if self.child_gens.is_empty() {
2173            // Have to create empty struct arrays specially to ensure they have the correct
2174            // row count
2175            let struct_arr = StructArray::new_empty_fields(length.0 as usize, None);
2176            return Ok(Arc::new(struct_arr));
2177        }
2178        let child_arrays = self
2179            .child_gens
2180            .iter_mut()
2181            .map(|genn| genn.generate(length, rng))
2182            .collect::<Result<Vec<_>, ArrowError>>()?;
2183        let struct_arr = StructArray::new(self.fields.clone(), child_arrays, None);
2184        Ok(Arc::new(struct_arr))
2185    }
2186
2187    fn data_type(&self) -> &DataType {
2188        &self.data_type
2189    }
2190
2191    fn element_size_bytes(&self) -> Option<ByteCount> {
2192        let mut sum = 0;
2193        for child_gen in &self.child_gens {
2194            sum += child_gen.element_size_bytes()?.0;
2195        }
2196        Some(ByteCount::from(sum))
2197    }
2198}
2199
/// A RecordBatchReader that generates batches of the given size from the given array generators
pub struct FixedSizeBatchGenerator {
    /// RNG shared by all column generators
    rng: rand_xoshiro::Xoshiro256PlusPlus,
    /// One generator per column, in schema order
    generators: Vec<Box<dyn ArrayGenerator>>,
    /// Number of rows in every generated batch
    batch_size: RowCount,
    /// Number of batches that are still to be generated
    num_batches: BatchCount,
    /// Schema of the generated batches
    schema: SchemaRef,
}
2208
2209impl FixedSizeBatchGenerator {
2210    fn new(
2211        generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
2212        batch_size: RowCount,
2213        num_batches: BatchCount,
2214        seed: Option<Seed>,
2215        default_null_probability: Option<f64>,
2216    ) -> Self {
2217        let mut fields = Vec::with_capacity(generators.len());
2218        for (field_index, field_gen) in generators.iter().enumerate() {
2219            let (name, genn) = field_gen;
2220            let default_name = format!("field_{}", field_index);
2221            let name = name.clone().unwrap_or(default_name);
2222            let mut field = Field::new(name, genn.data_type().clone(), true);
2223            if let Some(metadata) = genn.metadata() {
2224                field = field.with_metadata(metadata);
2225            }
2226            fields.push(field);
2227        }
2228        let mut generators = generators
2229            .into_iter()
2230            .map(|(_, genn)| genn)
2231            .collect::<Vec<_>>();
2232        if let Some(null_probability) = default_null_probability {
2233            generators = generators
2234                .into_iter()
2235                .map(|genn| genn.with_random_nulls(null_probability))
2236                .collect();
2237        }
2238        let schema = Arc::new(Schema::new(fields));
2239        Self {
2240            rng: rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2241                seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2242            ),
2243            generators,
2244            batch_size,
2245            num_batches,
2246            schema,
2247        }
2248    }
2249
2250    fn gen_next(&mut self) -> Result<RecordBatch, ArrowError> {
2251        let mut arrays = Vec::with_capacity(self.generators.len());
2252        for genn in self.generators.iter_mut() {
2253            let arr = genn.generate(self.batch_size, &mut self.rng)?;
2254            arrays.push(arr);
2255        }
2256        self.num_batches.0 -= 1;
2257        Ok(RecordBatch::try_new_with_options(
2258            self.schema.clone(),
2259            arrays,
2260            &RecordBatchOptions::new().with_row_count(Some(self.batch_size.0 as usize)),
2261        )
2262        .unwrap())
2263    }
2264}
2265
2266impl Iterator for FixedSizeBatchGenerator {
2267    type Item = Result<RecordBatch, ArrowError>;
2268
2269    fn next(&mut self) -> Option<Self::Item> {
2270        if self.num_batches.0 == 0 {
2271            return None;
2272        }
2273        Some(self.gen_next())
2274    }
2275}
2276
2277impl RecordBatchReader for FixedSizeBatchGenerator {
2278    fn schema(&self) -> SchemaRef {
2279        self.schema.clone()
2280    }
2281}
2282
/// A builder to create a record batch reader with generated data
///
/// This type is meant to be used in a fluent builder style to define the schema and generators
/// for a record batch reader.
#[derive(Default)]
pub struct BatchGeneratorBuilder {
    /// The columns added so far: an optional name and the generator for the column
    generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
    /// If set, the probability of nulls that is applied to all columns
    default_null_probability: Option<f64>,
    /// Seed for the random number generator; a default seed is used when unset
    seed: Option<Seed>,
}
2293
/// Controls what `into_reader_bytes` / `into_batch_bytes` do when the requested batch size
/// in bytes is not a multiple of the size of a row.
pub enum RoundingBehavior {
    /// Return an error
    ExactOrErr,
    /// Add one more row, so the batch is larger than requested
    RoundUp,
    /// Drop the partial row, so the batch is smaller than requested
    RoundDown,
}
2299
2300impl BatchGeneratorBuilder {
2301    /// Create a new BatchGeneratorBuilder with a default random seed
2302    pub fn new() -> Self {
2303        Default::default()
2304    }
2305
2306    /// Create a new BatchGeneratorBuilder with the given seed
2307    pub fn new_with_seed(seed: Seed) -> Self {
2308        Self {
2309            seed: Some(seed),
2310            ..Default::default()
2311        }
2312    }
2313
2314    /// Adds a new column to the generator
2315    ///
2316    /// See [`crate::generator::array`] for methods to create generators
2317    pub fn col(mut self, name: impl Into<String>, genn: Box<dyn ArrayGenerator>) -> Self {
2318        self.generators.push((Some(name.into()), genn));
2319        self
2320    }
2321
2322    /// Adds a new column to the generator with a generated unique name
2323    ///
2324    /// See [`crate::generator::array`] for methods to create generators
2325    pub fn anon_col(mut self, genn: Box<dyn ArrayGenerator>) -> Self {
2326        self.generators.push((None, genn));
2327        self
2328    }
2329
2330    pub fn into_batch_rows(self, batch_size: RowCount) -> Result<RecordBatch, ArrowError> {
2331        let mut reader = self.into_reader_rows(batch_size, BatchCount::from(1));
2332        reader
2333            .next()
2334            .expect("Asked for 1 batch but reader was empty")
2335    }
2336
2337    pub fn into_batch_bytes(
2338        self,
2339        batch_size: ByteCount,
2340        rounding: RoundingBehavior,
2341    ) -> Result<RecordBatch, ArrowError> {
2342        let mut reader = self.into_reader_bytes(batch_size, BatchCount::from(1), rounding)?;
2343        reader
2344            .next()
2345            .expect("Asked for 1 batch but reader was empty")
2346    }
2347
2348    /// Create a RecordBatchReader that generates batches of the given size (in rows)
2349    pub fn into_reader_rows(
2350        self,
2351        batch_size: RowCount,
2352        num_batches: BatchCount,
2353    ) -> impl RecordBatchReader {
2354        FixedSizeBatchGenerator::new(
2355            self.generators,
2356            batch_size,
2357            num_batches,
2358            self.seed,
2359            self.default_null_probability,
2360        )
2361    }
2362
2363    pub fn into_reader_stream(
2364        self,
2365        batch_size: RowCount,
2366        num_batches: BatchCount,
2367    ) -> (
2368        BoxStream<'static, Result<RecordBatch, ArrowError>>,
2369        Arc<Schema>,
2370    ) {
2371        // TODO: this is pretty lazy and could be optimized
2372        let reader = self.into_reader_rows(batch_size, num_batches);
2373        let schema = reader.schema();
2374        let batches = reader.collect::<Vec<_>>();
2375        (futures::stream::iter(batches).boxed(), schema)
2376    }
2377
2378    /// Create a RecordBatchReader that generates batches of the given size (in bytes)
2379    pub fn into_reader_bytes(
2380        self,
2381        batch_size_bytes: ByteCount,
2382        num_batches: BatchCount,
2383        rounding: RoundingBehavior,
2384    ) -> Result<impl RecordBatchReader, ArrowError> {
2385        let bytes_per_row = self
2386            .generators
2387            .iter()
2388            .map(|genn| genn.1.element_size_bytes().map(|byte_count| byte_count.0).ok_or(
2389                        ArrowError::NotYetImplemented("The function into_reader_bytes currently requires each array generator to have a fixed element size".to_string())
2390                )
2391            )
2392            .sum::<Result<u64, ArrowError>>()?;
2393        let mut num_rows = RowCount::from(batch_size_bytes.0 / bytes_per_row);
2394        if !batch_size_bytes.0.is_multiple_of(bytes_per_row) {
2395            match rounding {
2396                RoundingBehavior::ExactOrErr => {
2397                    return Err(ArrowError::NotYetImplemented(format!(
2398                        "Exact rounding requested but not possible.  Batch size requested {}, row size: {}",
2399                        batch_size_bytes.0, bytes_per_row
2400                    )));
2401                }
2402                RoundingBehavior::RoundUp => {
2403                    num_rows = RowCount::from(num_rows.0 + 1);
2404                }
2405                RoundingBehavior::RoundDown => (),
2406            }
2407        }
2408        Ok(self.into_reader_rows(num_rows, num_batches))
2409    }
2410
2411    /// Set the seed for the generator
2412    pub fn with_seed(mut self, seed: Seed) -> Self {
2413        self.seed = Some(seed);
2414        self
2415    }
2416
2417    /// Adds nulls (with the given probability) to all columns
2418    pub fn with_random_nulls(&mut self, default_null_probability: f64) {
2419        self.default_null_probability = Some(default_null_probability);
2420    }
2421}
2422
/// Factory for creating a single random array
pub struct ArrayGeneratorBuilder {
    /// The generator that produces the array
    generator: Box<dyn ArrayGenerator>,
    /// Seed for the random number generator; a default seed is used when unset
    seed: Option<Seed>,
}
2428
2429impl ArrayGeneratorBuilder {
2430    fn new(generator: Box<dyn ArrayGenerator>) -> Self {
2431        Self {
2432            generator,
2433            seed: None,
2434        }
2435    }
2436
2437    /// Use the given seed for the generator
2438    pub fn with_seed(mut self, seed: Seed) -> Self {
2439        self.seed = Some(seed);
2440        self
2441    }
2442
2443    /// Generate a single array with the given length
2444    pub fn into_array_rows(
2445        mut self,
2446        length: RowCount,
2447    ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2448        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2449            self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2450        );
2451        self.generator.generate(length, &mut rng)
2452    }
2453}
2454
/// Number of milliseconds in one day (24 * 60 * 60 * 1000)
const MS_PER_DAY: i64 = 86400000;
2456
2457pub mod array {
2458
2459    use arrow::datatypes::{Int8Type, Int16Type, Int64Type};
2460    use arrow_array::types::{
2461        Decimal128Type, Decimal256Type, DurationMicrosecondType, DurationMillisecondType,
2462        DurationNanosecondType, DurationSecondType, Float16Type, Float32Type, Float64Type,
2463        UInt8Type, UInt16Type, UInt32Type, UInt64Type,
2464    };
2465    use arrow_array::{
2466        ArrowNativeTypeOp, BooleanArray, Date32Array, Date64Array, Time32MillisecondArray,
2467        Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
2468        TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
2469        TimestampSecondArray,
2470    };
2471    use arrow_schema::{IntervalUnit, TimeUnit};
2472    use chrono::Utc;
2473    use rand::prelude::Distribution;
2474
2475    use super::*;
2476
2477    /// Create a generator of vectors by continuously calling the given generator
2478    ///
2479    /// For example, given a step generator and a dimension of 3 this will generate vectors like
2480    /// [0, 1, 2], [3, 4, 5], [6, 7, 8], ...
2481    pub fn cycle_vec(
2482        generator: Box<dyn ArrayGenerator>,
2483        dimension: Dimension,
2484    ) -> Box<dyn ArrayGenerator> {
2485        Box::new(CycleVectorGenerator::new(generator, dimension))
2486    }
2487
2488    /// Create a generator of list vectors by continuously calling the given generator
2489    ///
2490    /// The lists will have lengths uniformly distributed between `min_list_size` (inclusive) and
2491    /// `max_list_size` (exclusive).
2492    pub fn cycle_vec_var(
2493        generator: Box<dyn ArrayGenerator>,
2494        min_list_size: Dimension,
2495        max_list_size: Dimension,
2496    ) -> Box<dyn ArrayGenerator> {
2497        Box::new(CycleListGenerator::new(
2498            generator,
2499            min_list_size,
2500            max_list_size,
2501        ))
2502    }
2503
2504    /// Create a generator of vectors around unit circle
2505    ///
2506    /// Vectors will be equally spaced around the unit circle so that there are num_steps
2507    /// vectors per circle.
2508    pub fn cycle_unit_circle(num_steps: u32) -> Box<dyn ArrayGenerator> {
2509        Box::new(RadialStepGenerator::new(num_steps))
2510    }
2511
2512    /// Create a generator of vectors by cycling through a given set of vectors
2513    ///
2514    /// Each value will be spaced in slightly away from the previous value on a ball of radius jitter
2515    pub fn jitter_centroids(centroids: Arc<dyn Array>, jitter: f32) -> Box<dyn ArrayGenerator> {
2516        Box::new(JitterCentroidsGenerator::try_new(centroids, jitter).unwrap())
2517    }
2518
2519    /// Create a generator from a vector of values
2520    ///
2521    /// If more rows are requested than the length of values then it will restart
2522    /// from the beginning of the vector.
2523    pub fn cycle<DataType>(values: Vec<DataType::Native>) -> Box<dyn ArrayGenerator>
2524    where
2525        DataType::Native: Copy + 'static,
2526        DataType: ArrowPrimitiveType,
2527        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2528    {
2529        let mut values_idx = 0;
2530        Box::new(
2531            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2532                DataType::DATA_TYPE,
2533                move |_| {
2534                    let y = values[values_idx];
2535                    values_idx = (values_idx + 1) % values.len();
2536                    y
2537                },
2538                1,
2539                DataType::DATA_TYPE
2540                    .primitive_width()
2541                    .map(|width| ByteCount::from(width as u64))
2542                    .expect("Primitive types should have a fixed width"),
2543            ),
2544        )
2545    }
2546
2547    /// Create a generator from a vector of booleans
2548    ///
2549    /// If more rows are requested than the length of values then it will restart from
2550    /// the beginning of the vector
2551    pub fn cycle_bool(values: Vec<bool>) -> Box<dyn ArrayGenerator> {
2552        let mut values_idx = 0;
2553        Box::new(FnGen::<bool, BooleanArray, _>::new_unknown_size(
2554            DataType::Boolean,
2555            move |_| {
2556                let val = values[values_idx];
2557                values_idx = (values_idx + 1) % values.len();
2558                val
2559            },
2560            1,
2561        ))
2562    }
2563
2564    /// Create a generator that starts at 0 and increments by 1 for each element
2565    pub fn step<DataType>() -> Box<dyn ArrayGenerator>
2566    where
2567        DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2568        DataType: ArrowPrimitiveType,
2569        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2570    {
2571        let mut x = DataType::Native::default();
2572        Box::new(
2573            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2574                DataType::DATA_TYPE,
2575                move |_| {
2576                    let y = x;
2577                    x += DataType::Native::ONE;
2578                    y
2579                },
2580                1,
2581                DataType::DATA_TYPE
2582                    .primitive_width()
2583                    .map(|width| ByteCount::from(width as u64))
2584                    .expect("Primitive types should have a fixed width"),
2585            ),
2586        )
2587    }
2588
2589    pub fn blob() -> Box<dyn ArrayGenerator> {
2590        let mut blob_meta = HashMap::new();
2591        blob_meta.insert("lance-encoding:blob".to_string(), "true".to_string());
2592        rand_fixedbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
2593    }
2594
2595    /// Create a generator that starts at a given value and increments by a given step for each element
2596    pub fn step_custom<DataType>(
2597        start: DataType::Native,
2598        step: DataType::Native,
2599    ) -> Box<dyn ArrayGenerator>
2600    where
2601        DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2602        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2603        DataType: ArrowPrimitiveType,
2604    {
2605        let mut x = start;
2606        Box::new(
2607            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2608                DataType::DATA_TYPE,
2609                move |_| {
2610                    let y = x;
2611                    x += step;
2612                    y
2613                },
2614                1,
2615                DataType::DATA_TYPE
2616                    .primitive_width()
2617                    .map(|width| ByteCount::from(width as u64))
2618                    .expect("Primitive types should have a fixed width"),
2619            ),
2620        )
2621    }
2622
2623    /// Create a generator that fills each element with the given primitive value
2624    pub fn fill<DataType>(value: DataType::Native) -> Box<dyn ArrayGenerator>
2625    where
2626        DataType::Native: Copy + 'static,
2627        DataType: ArrowPrimitiveType,
2628        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2629    {
2630        Box::new(
2631            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2632                DataType::DATA_TYPE,
2633                move |_| value,
2634                1,
2635                DataType::DATA_TYPE
2636                    .primitive_width()
2637                    .map(|width| ByteCount::from(width as u64))
2638                    .expect("Primitive types should have a fixed width"),
2639            ),
2640        )
2641    }
2642
2643    /// Create a generator that fills each element with the given binary value
2644    pub fn fill_varbin(value: Vec<u8>) -> Box<dyn ArrayGenerator> {
2645        Box::new(FixedBinaryGenerator::<BinaryType>::new(value))
2646    }
2647
2648    /// Create a generator that fills each element with the given string value
2649    pub fn fill_utf8(value: String) -> Box<dyn ArrayGenerator> {
2650        Box::new(FixedBinaryGenerator::<Utf8Type>::new(value.into_bytes()))
2651    }
2652
2653    pub fn cycle_utf8_literals(values: &[&'static str]) -> Box<dyn ArrayGenerator> {
2654        Box::new(CycleBinaryGenerator::<Utf8Type>::from_strings(values))
2655    }
2656
2657    /// Create a generator of primitive values that are randomly sampled from the entire range available for the value
2658    pub fn rand<DataType>() -> Box<dyn ArrayGenerator>
2659    where
2660        DataType::Native: Copy + 'static,
2661        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2662        DataType: ArrowPrimitiveType,
2663        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2664    {
2665        Box::new(
2666            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2667                DataType::DATA_TYPE,
2668                move |rng| rng.random(),
2669                1,
2670                DataType::DATA_TYPE
2671                    .primitive_width()
2672                    .map(|width| ByteCount::from(width as u64))
2673                    .expect("Primitive types should have a fixed width"),
2674            ),
2675        )
2676    }
2677
2678    /// Create a generator of primitive values that are randomly sampled from the entire range available for the value
2679    pub fn rand_with_distribution<
2680        DataType,
2681        Dist: rand::distr::Distribution<DataType::Native> + Clone + Send + Sync + 'static,
2682    >(
2683        dist: Dist,
2684    ) -> Box<dyn ArrayGenerator>
2685    where
2686        DataType::Native: Copy + 'static,
2687        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2688        DataType: ArrowPrimitiveType,
2689    {
2690        Box::new(
2691            FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2692                DataType::DATA_TYPE,
2693                move |rng| rng.sample(dist.clone()),
2694                1,
2695                DataType::DATA_TYPE
2696                    .primitive_width()
2697                    .map(|width| ByteCount::from(width as u64))
2698                    .expect("Primitive types should have a fixed width"),
2699            ),
2700        )
2701    }
2702
2703    /// Create a generator of 1d vectors (of a primitive type) consisting of randomly sampled primitive values
2704    pub fn rand_vec<DataType>(dimension: Dimension) -> Box<dyn ArrayGenerator>
2705    where
2706        DataType::Native: Copy + 'static,
2707        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2708        DataType: ArrowPrimitiveType,
2709        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2710    {
2711        let underlying = rand::<DataType>();
2712        cycle_vec(underlying, dimension)
2713    }
2714
2715    /// Create a generator of 1d vectors (of a primitive type) consisting of randomly sampled nullable values
2716    pub fn rand_vec_nullable<DataType>(
2717        dimension: Dimension,
2718        null_probability: f64,
2719    ) -> Box<dyn ArrayGenerator>
2720    where
2721        DataType::Native: Copy + 'static,
2722        PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2723        DataType: ArrowPrimitiveType,
2724        rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2725    {
2726        let underlying = rand::<DataType>().with_random_nulls(null_probability);
2727        cycle_vec(underlying, dimension)
2728    }
2729
2730    /// Create a generator of randomly sampled time32 values covering the entire
2731    /// range of 1 day
2732    pub fn rand_time32(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2733        let start = 0;
2734        let end = match resolution {
2735            TimeUnit::Second => 86_400,
2736            TimeUnit::Millisecond => 86_400_000,
2737            _ => panic!(),
2738        };
2739
2740        let data_type = DataType::Time32(*resolution);
2741        let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2742        let dist = Uniform::new(start, end).unwrap();
2743        let sample_fn = move |rng: &mut _| dist.sample(rng);
2744
2745        match resolution {
2746            TimeUnit::Second => Box::new(FnGen::<i32, Time32SecondArray, _>::new_known_size(
2747                data_type, sample_fn, 1, size,
2748            )),
2749            TimeUnit::Millisecond => {
2750                Box::new(FnGen::<i32, Time32MillisecondArray, _>::new_known_size(
2751                    data_type, sample_fn, 1, size,
2752                ))
2753            }
2754            _ => panic!(),
2755        }
2756    }
2757
2758    /// Create a generator of randomly sampled time64 values covering the entire
2759    /// range of 1 day
2760    pub fn rand_time64(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2761        let start = 0_i64;
2762        let end: i64 = match resolution {
2763            TimeUnit::Microsecond => 86_400_000,
2764            TimeUnit::Nanosecond => 86_400_000_000,
2765            _ => panic!(),
2766        };
2767
2768        let data_type = DataType::Time64(*resolution);
2769        let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2770        let dist = Uniform::new(start, end).unwrap();
2771        let sample_fn = move |rng: &mut _| dist.sample(rng);
2772
2773        match resolution {
2774            TimeUnit::Microsecond => {
2775                Box::new(FnGen::<i64, Time64MicrosecondArray, _>::new_known_size(
2776                    data_type, sample_fn, 1, size,
2777                ))
2778            }
2779            TimeUnit::Nanosecond => {
2780                Box::new(FnGen::<i64, Time64NanosecondArray, _>::new_known_size(
2781                    data_type, sample_fn, 1, size,
2782                ))
2783            }
2784            _ => panic!(),
2785        }
2786    }
2787
2788    /// Create a generator of random UUIDs, stored as fixed size binary values
2789    ///
2790    /// Note, these are "pseudo UUIDs".  They are 16-byte randomish values but they
2791    /// are not guaranteed to be unique.  We use a simplistic RNG that trades uniqueness
2792    /// for speed.
2793    pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
2794        Box::<PseudoUuidGenerator>::default()
2795    }
2796
2797    /// Create a generator of random UUIDs, stored as 32-character strings (hex encoding
2798    /// of the 16-byte binary value)
2799    ///
2800    /// Note, these are "pseudo UUIDs".  They are 16-byte randomish values but they
2801    /// are not guaranteed to be unique.  We use a simplistic RNG that trades uniqueness
2802    /// for speed.
2803    pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
2804        Box::<PseudoUuidHexGenerator>::default()
2805    }
2806
2807    pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(
2808        data_type: DataType,
2809    ) -> Box<dyn ArrayGenerator> {
2810        Box::new(RandomBytesGenerator::<T>::new(data_type))
2811    }
2812
2813    pub fn rand_fsb(size: i32) -> Box<dyn ArrayGenerator> {
2814        Box::new(RandomFixedSizeBinaryGenerator::new(size))
2815    }
2816
2817    pub fn rand_interval(unit: IntervalUnit) -> Box<dyn ArrayGenerator> {
2818        Box::new(RandomIntervalGenerator::new(unit))
2819    }
2820
2821    /// Create a generator of randomly sampled date32 values
2822    ///
2823    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2824    /// is a more common use pattern
2825    pub fn rand_date32() -> Box<dyn ArrayGenerator> {
2826        let now = chrono::Utc::now();
2827        let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try days");
2828        rand_date32_in_range(one_year_ago, now)
2829    }
2830
2831    /// Create a generator of randomly sampled date32 values in the given range
2832    pub fn rand_date32_in_range(
2833        start: chrono::DateTime<Utc>,
2834        end: chrono::DateTime<Utc>,
2835    ) -> Box<dyn ArrayGenerator> {
2836        let data_type = DataType::Date32;
2837        let end_ms = end.timestamp_millis();
2838        let end_days = (end_ms / MS_PER_DAY) as i32;
2839        let start_ms = start.timestamp_millis();
2840        let start_days = (start_ms / MS_PER_DAY) as i32;
2841        let dist = Uniform::new(start_days, end_days).unwrap();
2842
2843        Box::new(FnGen::<i32, Date32Array, _>::new_known_size(
2844            data_type,
2845            move |rng| dist.sample(rng),
2846            1,
2847            DataType::Date32
2848                .primitive_width()
2849                .map(|width| ByteCount::from(width as u64))
2850                .expect("Date32 should have a fixed width"),
2851        ))
2852    }
2853
2854    /// Create a generator of randomly sampled date64 values
2855    ///
2856    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2857    /// is a more common use pattern
2858    pub fn rand_date64() -> Box<dyn ArrayGenerator> {
2859        let now = chrono::Utc::now();
2860        let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try_days");
2861        rand_date64_in_range(one_year_ago, now)
2862    }
2863
2864    /// Create a generator of randomly sampled timestamp values in the given range
2865    ///
2866    /// Currently just samples the entire range of u64 values and casts to timestamp
2867    pub fn rand_timestamp_in_range(
2868        start: chrono::DateTime<Utc>,
2869        end: chrono::DateTime<Utc>,
2870        data_type: &DataType,
2871    ) -> Box<dyn ArrayGenerator> {
2872        let end_ms = end.timestamp_millis();
2873        let start_ms = start.timestamp_millis();
2874        let (start_ticks, end_ticks) = match data_type {
2875            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2876                (start_ms * 1000 * 1000, end_ms * 1000 * 1000)
2877            }
2878            DataType::Timestamp(TimeUnit::Microsecond, _) => (start_ms * 1000, end_ms * 1000),
2879            DataType::Timestamp(TimeUnit::Millisecond, _) => (start_ms, end_ms),
2880            DataType::Timestamp(TimeUnit::Second, _) => (start.timestamp(), end.timestamp()),
2881            _ => panic!(),
2882        };
2883        let dist = Uniform::new(start_ticks, end_ticks).unwrap();
2884
2885        let data_type = data_type.clone();
2886        let sample_fn = move |rng: &mut _| dist.sample(rng);
2887        let width = data_type
2888            .primitive_width()
2889            .map(|width| ByteCount::from(width as u64))
2890            .unwrap();
2891
2892        match data_type {
2893            DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2894                Box::new(FnGen::<i64, TimestampNanosecondArray, _>::new_known_size(
2895                    data_type, sample_fn, 1, width,
2896                ))
2897            }
2898            DataType::Timestamp(TimeUnit::Microsecond, _) => {
2899                Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2900                    data_type, sample_fn, 1, width,
2901                ))
2902            }
2903            DataType::Timestamp(TimeUnit::Millisecond, _) => {
2904                Box::new(FnGen::<i64, TimestampMillisecondArray, _>::new_known_size(
2905                    data_type, sample_fn, 1, width,
2906                ))
2907            }
2908            DataType::Timestamp(TimeUnit::Second, _) => {
2909                Box::new(FnGen::<i64, TimestampSecondArray, _>::new_known_size(
2910                    data_type, sample_fn, 1, width,
2911                ))
2912            }
2913            _ => panic!(),
2914        }
2915    }
2916
2917    pub fn rand_timestamp(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2918        let now = chrono::Utc::now();
2919        let one_year_ago = now - chrono::Duration::try_days(365).unwrap();
2920        rand_timestamp_in_range(one_year_ago, now, data_type)
2921    }
2922
2923    /// Create a generator of randomly sampled date64 values
2924    ///
2925    /// Instead of sampling the entire range, all values will be drawn from the last year as this
2926    /// is a more common use pattern
2927    pub fn rand_date64_in_range(
2928        start: chrono::DateTime<Utc>,
2929        end: chrono::DateTime<Utc>,
2930    ) -> Box<dyn ArrayGenerator> {
2931        let data_type = DataType::Date64;
2932        let end_ms = end.timestamp_millis();
2933        let end_days = end_ms / MS_PER_DAY;
2934        let start_ms = start.timestamp_millis();
2935        let start_days = start_ms / MS_PER_DAY;
2936        let dist = Uniform::new(start_days, end_days).unwrap();
2937
2938        Box::new(FnGen::<i64, Date64Array, _>::new_known_size(
2939            data_type,
2940            move |rng| (dist.sample(rng)) * MS_PER_DAY,
2941            1,
2942            DataType::Date64
2943                .primitive_width()
2944                .map(|width| ByteCount::from(width as u64))
2945                .expect("Date64 should have a fixed width"),
2946        ))
2947    }
2948
2949    /// Create a generator of random binary values where each value has a fixed number of bytes
2950    pub fn rand_fixedbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2951        Box::new(RandomBinaryGenerator::new(
2952            bytes_per_element,
2953            false,
2954            is_large,
2955        ))
2956    }
2957
2958    /// Create a generator of random binary values where each value has a variable number of bytes
2959    ///
2960    /// The number of bytes per element will be randomly sampled from the given (inclusive) range
2961    pub fn rand_varbin(
2962        min_bytes_per_element: ByteCount,
2963        max_bytes_per_element: ByteCount,
2964    ) -> Box<dyn ArrayGenerator> {
2965        Box::new(VariableRandomBinaryGenerator::new(
2966            min_bytes_per_element,
2967            max_bytes_per_element,
2968        ))
2969    }
2970
2971    /// Create a generator of random strings
2972    ///
2973    /// All strings will consist entirely of printable ASCII characters
2974    pub fn rand_utf8(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2975        Box::new(RandomBinaryGenerator::new(
2976            bytes_per_element,
2977            true,
2978            is_large,
2979        ))
2980    }
2981
2982    /// Creates a generator of strings with a prefix and a counter
2983    ///
2984    /// For example, if the prefix is "user_" the strings will be "user_0", "user_1", ...
2985    pub fn utf8_prefix_plus_counter(
2986        prefix: impl Into<String>,
2987        is_large: bool,
2988    ) -> Box<dyn ArrayGenerator> {
2989        Box::new(PrefixPlusCounterGenerator::new(prefix.into(), is_large))
2990    }
2991
2992    pub fn binary_prefix_plus_counter(
2993        prefix: Arc<[u8]>,
2994        is_large: bool,
2995    ) -> Box<dyn ArrayGenerator> {
2996        Box::new(BinaryPrefixPlusCounterGenerator::new(prefix, is_large))
2997    }
2998
2999    /// Create a random generator of boolean values
3000    pub fn rand_boolean() -> Box<dyn ArrayGenerator> {
3001        Box::<RandomBooleanGenerator>::default()
3002    }
3003
3004    /// Create a generator of random sentences
3005    ///
3006    /// Generates strings containing between min_words and max_words random English words joined by spaces
3007    pub fn random_sentence(
3008        min_words: usize,
3009        max_words: usize,
3010        is_large: bool,
3011    ) -> Box<dyn ArrayGenerator> {
3012        Box::new(RandomSentenceGenerator::new(min_words, max_words, is_large))
3013    }
3014
3015    /// Create a generator of random words (one word per row)
3016    ///
3017    /// Generates strings containing a single random English word per row
3018    pub fn random_word(is_large: bool) -> Box<dyn ArrayGenerator> {
3019        Box::new(RandomWordGenerator::new(is_large))
3020    }
3021
3022    pub fn rand_list(item_type: &DataType, is_large: bool) -> Box<dyn ArrayGenerator> {
3023        let child_gen = rand_type(item_type);
3024        Box::new(RandomListGenerator::new(child_gen, is_large))
3025    }
3026
3027    pub fn rand_list_any(
3028        item_gen: Box<dyn ArrayGenerator>,
3029        is_large: bool,
3030    ) -> Box<dyn ArrayGenerator> {
3031        Box::new(RandomListGenerator::new(item_gen, is_large))
3032    }
3033
3034    /// Generates random map arrays where each map has 0-4 entries.
3035    pub fn rand_map(key_type: &DataType, value_type: &DataType) -> Box<dyn ArrayGenerator> {
3036        let keys_gen = rand_type(key_type);
3037        let values_gen = rand_type(value_type);
3038        Box::new(RandomMapGenerator::new(keys_gen, values_gen))
3039    }
3040
3041    pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
3042        let child_gens = fields
3043            .iter()
3044            .map(|f| rand_type(f.data_type()))
3045            .collect::<Vec<_>>();
3046        Box::new(RandomStructGenerator::new(fields, child_gens))
3047    }
3048
3049    pub fn null_type() -> Box<dyn ArrayGenerator> {
3050        Box::new(NullArrayGenerator {})
3051    }
3052
3053    /// Create a generator of random values
3054    pub fn rand_type(data_type: &DataType) -> Box<dyn ArrayGenerator> {
3055        match data_type {
3056            DataType::Boolean => rand_boolean(),
3057            DataType::Int8 => rand::<Int8Type>(),
3058            DataType::Int16 => rand::<Int16Type>(),
3059            DataType::Int32 => rand::<Int32Type>(),
3060            DataType::Int64 => rand::<Int64Type>(),
3061            DataType::UInt8 => rand::<UInt8Type>(),
3062            DataType::UInt16 => rand::<UInt16Type>(),
3063            DataType::UInt32 => rand::<UInt32Type>(),
3064            DataType::UInt64 => rand::<UInt64Type>(),
3065            DataType::Float16 => rand_primitive::<Float16Type>(data_type.clone()),
3066            DataType::Float32 => rand::<Float32Type>(),
3067            DataType::Float64 => rand::<Float64Type>(),
3068            DataType::Decimal128(_, _) => rand_primitive::<Decimal128Type>(data_type.clone()),
3069            DataType::Decimal256(_, _) => rand_primitive::<Decimal256Type>(data_type.clone()),
3070            DataType::Utf8 => rand_utf8(ByteCount::from(12), false),
3071            DataType::LargeUtf8 => rand_utf8(ByteCount::from(12), true),
3072            DataType::Binary => rand_fixedbin(ByteCount::from(12), false),
3073            DataType::LargeBinary => rand_fixedbin(ByteCount::from(12), true),
3074            DataType::Dictionary(key_type, value_type) => {
3075                dict_type(rand_type(value_type), key_type)
3076            }
3077            DataType::FixedSizeList(child, dimension) => cycle_vec(
3078                rand_type(child.data_type()),
3079                Dimension::from(*dimension as u32),
3080            ),
3081            DataType::FixedSizeBinary(size) => rand_fsb(*size),
3082            DataType::List(child) => rand_list(child.data_type(), false),
3083            DataType::LargeList(child) => rand_list(child.data_type(), true),
3084            DataType::Map(entries_field, _) => {
3085                let DataType::Struct(fields) = entries_field.data_type() else {
3086                    panic!("Map entries field must be a struct");
3087                };
3088                let key_type = fields[0].data_type();
3089                let value_type = fields[1].data_type();
3090                rand_map(key_type, value_type)
3091            }
3092            DataType::Duration(unit) => match unit {
3093                TimeUnit::Second => rand::<DurationSecondType>(),
3094                TimeUnit::Millisecond => rand::<DurationMillisecondType>(),
3095                TimeUnit::Microsecond => rand::<DurationMicrosecondType>(),
3096                TimeUnit::Nanosecond => rand::<DurationNanosecondType>(),
3097            },
3098            DataType::Interval(unit) => rand_interval(*unit),
3099            DataType::Date32 => rand_date32(),
3100            DataType::Date64 => rand_date64(),
3101            DataType::Time32(resolution) => rand_time32(resolution),
3102            DataType::Time64(resolution) => rand_time64(resolution),
3103            DataType::Timestamp(_, _) => rand_timestamp(data_type),
3104            DataType::Struct(fields) => rand_struct(fields.clone()),
3105            DataType::Null => null_type(),
3106            _ => unimplemented!("random generation of {}", data_type),
3107        }
3108    }
3109
3110    /// Encodes arrays generated by the underlying generator as dictionaries with the given key type
3111    ///
3112    /// Note that this may not be very realistic if the underlying generator is something like a random
3113    /// generator since most of the underlying values will be unique and the common case for dictionary
3114    /// encoding is when there is a small set of possible values.
3115    pub fn dict<K: ArrowDictionaryKeyType + Send + Sync>(
3116        generator: Box<dyn ArrayGenerator>,
3117    ) -> Box<dyn ArrayGenerator> {
3118        Box::new(DictionaryGenerator::<K>::new(generator))
3119    }
3120
3121    /// Encodes arrays generated by the underlying generator as dictionaries with the given key type
3122    pub fn dict_type(
3123        generator: Box<dyn ArrayGenerator>,
3124        key_type: &DataType,
3125    ) -> Box<dyn ArrayGenerator> {
3126        match key_type {
3127            DataType::Int8 => dict::<Int8Type>(generator),
3128            DataType::Int16 => dict::<Int16Type>(generator),
3129            DataType::Int32 => dict::<Int32Type>(generator),
3130            DataType::Int64 => dict::<Int64Type>(generator),
3131            DataType::UInt8 => dict::<UInt8Type>(generator),
3132            DataType::UInt16 => dict::<UInt16Type>(generator),
3133            DataType::UInt32 => dict::<UInt32Type>(generator),
3134            DataType::UInt64 => dict::<UInt64Type>(generator),
3135            _ => unimplemented!(),
3136        }
3137    }
3138
3139    /// Wraps a generator to produce low-cardinality data.
3140    ///
3141    /// Generates `cardinality` unique values on first call, then randomly
3142    /// selects from them for all subsequent rows.
3143    pub fn low_cardinality(
3144        generator: Box<dyn ArrayGenerator>,
3145        cardinality: usize,
3146    ) -> Box<dyn ArrayGenerator> {
3147        Box::new(LowCardinalityGenerator::new(generator, cardinality))
3148    }
3149}
3150
3151/// Create a BatchGeneratorBuilder to start generating batch data
3152pub fn gen_batch() -> BatchGeneratorBuilder {
3153    BatchGeneratorBuilder::default()
3154}
3155
/// Create an ArrayGeneratorBuilder to start generating array data
///
/// `genn` is the array generator the builder is constructed around.
pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
    ArrayGeneratorBuilder::new(genn)
}
3160
/// Metadata key to specify content type for string generation.
/// Set to "sentence" to use the sentence generator with Zipf distribution.
///
/// [`rand_field`] only acts on the value `"sentence"` and only for `Utf8` / `LargeUtf8`
/// fields; any other value or data type falls back to the default generator for the
/// field's data type.
pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type";
3164
/// Metadata key to specify cardinality for low-cardinality data generation.
/// Set to a numeric string (e.g., "100") to limit unique values.
///
/// [`rand_field`] ignores the value unless it parses as a `usize` greater than zero.
pub const CARDINALITY_KEY: &str = "lance-datagen:cardinality";
3168
3169/// Create a generator for a field, checking metadata for content type hints.
3170///
3171/// Supported metadata keys:
3172/// - `lance-datagen:content-type`: Set to "sentence" for Utf8/LargeUtf8 fields
3173///   to use the sentence generator with Zipf distribution.
3174/// - `lance-datagen:cardinality`: Set to a number to limit unique values.
3175///   The generator will produce only that many unique values and randomly
3176///   select from them.
3177pub fn rand_field(field: &Field) -> Box<dyn ArrayGenerator> {
3178    let mut generator = if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) {
3179        match (content_type.as_str(), field.data_type()) {
3180            ("sentence", DataType::Utf8) => array::random_sentence(1, 10, false),
3181            ("sentence", DataType::LargeUtf8) => array::random_sentence(1, 10, true),
3182            _ => array::rand_type(field.data_type()),
3183        }
3184    } else {
3185        array::rand_type(field.data_type())
3186    };
3187
3188    if let Some(cardinality_str) = field.metadata().get(CARDINALITY_KEY)
3189        && let Ok(cardinality) = cardinality_str.parse::<usize>()
3190        && cardinality > 0
3191    {
3192        generator = array::low_cardinality(generator, cardinality);
3193    }
3194
3195    generator
3196}
3197
3198/// Create a BatchGeneratorBuilder with the given schema
3199///
3200/// You can add more columns or convert this into a reader immediately.
3201///
3202/// Supported field metadata:
3203/// - `lance-datagen:content-type` = `"sentence"`: Use sentence generator with
3204///   Zipf distribution for more realistic text (Utf8/LargeUtf8 only).
3205/// - `lance-datagen:cardinality` = `"<number>"`: Limit to N unique values.
3206pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
3207    let mut builder = BatchGeneratorBuilder::default();
3208    for field in schema.fields() {
3209        builder = builder.col(field.name(), rand_field(field));
3210    }
3211    builder
3212}
3213
3214#[cfg(test)]
3215mod tests {
3216
3217    use arrow::datatypes::{Float32Type, Int8Type, Int16Type, UInt32Type};
3218    use arrow_array::{BooleanArray, Float32Array, Int8Array, Int16Array, Int32Array, UInt32Array};
3219
3220    use super::*;
3221
3222    #[test]
3223    fn test_step() {
3224        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3225        let mut genn = array::step::<Int32Type>();
3226        assert_eq!(
3227            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3228            Int32Array::from_iter([0, 1, 2, 3, 4])
3229        );
3230        assert_eq!(
3231            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3232            Int32Array::from_iter([5, 6, 7, 8, 9])
3233        );
3234
3235        let mut genn = array::step::<Int8Type>();
3236        assert_eq!(
3237            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3238            Int8Array::from_iter([0, 1, 2])
3239        );
3240
3241        let mut genn = array::step::<Float32Type>();
3242        assert_eq!(
3243            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3244            Float32Array::from_iter([0.0, 1.0, 2.0])
3245        );
3246
3247        let mut genn = array::step_custom::<Int16Type>(4, 8);
3248        assert_eq!(
3249            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3250            Int16Array::from_iter([4, 12, 20])
3251        );
3252        assert_eq!(
3253            *genn.generate(RowCount::from(2), &mut rng).unwrap(),
3254            Int16Array::from_iter([28, 36])
3255        );
3256    }
3257
3258    #[test]
3259    fn test_cycle() {
3260        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3261        let mut genn = array::cycle::<Int32Type>(vec![1, 2, 3]);
3262        assert_eq!(
3263            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3264            Int32Array::from_iter([1, 2, 3, 1, 2])
3265        );
3266
3267        let mut genn = array::cycle_utf8_literals(&["abc", "def", "xyz"]);
3268        assert_eq!(
3269            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3270            StringArray::from_iter_values(["abc", "def", "xyz", "abc", "def"])
3271        );
3272        assert_eq!(
3273            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
3274            StringArray::from_iter_values(["xyz"])
3275        );
3276
3277        let mut genn = array::cycle_bool(vec![false, false, true]);
3278        assert_eq!(
3279            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3280            BooleanArray::from_iter(vec![false, false, true, false, false].into_iter().map(Some))
3281        );
3282        assert_eq!(
3283            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
3284            BooleanArray::from_iter(vec![Some(true)])
3285        )
3286    }
3287
3288    #[test]
3289    fn test_fill() {
3290        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3291        let mut genn = array::fill::<Int32Type>(42);
3292        assert_eq!(
3293            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3294            Int32Array::from_iter([42, 42, 42])
3295        );
3296        assert_eq!(
3297            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3298            Int32Array::from_iter([42, 42, 42])
3299        );
3300
3301        let mut genn = array::fill_varbin(vec![0, 1, 2]);
3302        assert_eq!(
3303            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3304            arrow_array::BinaryArray::from_iter_values([
3305                "\x00\x01\x02",
3306                "\x00\x01\x02",
3307                "\x00\x01\x02"
3308            ])
3309        );
3310
3311        let mut genn = array::fill_utf8("xyz".to_string());
3312        assert_eq!(
3313            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3314            arrow_array::StringArray::from_iter_values(["xyz", "xyz", "xyz"])
3315        );
3316    }
3317
3318    #[test]
3319    fn test_utf8_prefix_plus_counter() {
3320        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3321        let mut genn = array::utf8_prefix_plus_counter("user_", false);
3322        assert_eq!(
3323            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3324            arrow_array::StringArray::from_iter_values(["user_0", "user_1", "user_2"])
3325        );
3326
3327        let mut genn = array::utf8_prefix_plus_counter("user_", true);
3328        assert_eq!(
3329            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3330            arrow_array::LargeStringArray::from_iter_values(["user_0", "user_1", "user_2"])
3331        );
3332    }
3333
    #[test]
    fn test_rng() {
        // Note: these tests are heavily dependent on the default seed.
        // A single RNG is threaded through every generator below, so the pinned values
        // also depend on the order in which the generators are exercised.
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<Int32Type>();
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int32Array::from_iter([-797553329, 1369325940, -69174021])
        );

        // Fixed-width binary: three values of three bytes each
        let mut genn = array::rand_fixedbin(ByteCount::from(3), false);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::BinaryArray::from_iter_values([
                [184, 53, 216],
                [12, 96, 159],
                [125, 179, 56]
            ])
        );

        // Fixed-width strings: three values of three characters each
        let mut genn = array::rand_utf8(ByteCount::from(3), false);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::StringArray::from_iter_values([">@p", "n `", "NWa"])
        );

        let mut genn = array::random_sentence(1, 5, false);
        let words = genn.generate(RowCount::from(10), &mut rng).unwrap();
        assert_eq!(words.data_type(), &DataType::Utf8);
        let words_array = words.as_any().downcast_ref::<StringArray>().unwrap();
        // Verify each string contains 1-5 words
        for i in 0..10 {
            let sentence = words_array.value(i);
            let word_count = sentence.split_whitespace().count();
            assert!((1..=5).contains(&word_count));
        }

        // The date generators sample relative to the current time, so only the
        // resulting data type is checked
        let mut genn = array::rand_date32();
        let days_32 = genn.generate(RowCount::from(3), &mut rng).unwrap();
        assert_eq!(days_32.data_type(), &DataType::Date32);

        let mut genn = array::rand_date64();
        let days_64 = genn.generate(RowCount::from(3), &mut rng).unwrap();
        assert_eq!(days_64.data_type(), &DataType::Date64);

        let mut genn = array::rand_boolean();
        let bools = genn.generate(RowCount::from(1024), &mut rng).unwrap();
        assert_eq!(bools.data_type(), &DataType::Boolean);
        let bools = bools.as_any().downcast_ref::<BooleanArray>().unwrap();
        // Sanity check to ensure we're getting at least some rng
        assert!(bools.false_count() > 100);
        assert!(bools.true_count() > 100);

        // Variable-width binary: each value has between 2 and 4 bytes
        let mut genn = array::rand_varbin(ByteCount::from(2), ByteCount::from(4));
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::BinaryArray::from_iter_values([
                vec![111, 9, 80],
                vec![86, 118, 13, 209],
                vec![68, 33, 202]
            ])
        );
    }
3397
3398    #[test]
3399    fn test_rng_list() {
3400        // Note: these tests are heavily dependent on the default seed.
3401        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3402        let mut genn = array::rand_list(&DataType::Int32, false);
3403        let arr = genn.generate(RowCount::from(100), &mut rng).unwrap();
3404        // Make sure we can generate empty lists (note, test is dependent on seed)
3405        let arr = arr.as_list::<i32>();
3406        assert!(arr.iter().any(|l| l.unwrap().is_empty()));
3407        // Shouldn't generate any giant lists (don't kill performance in normal datagen)
3408        assert!(arr.iter().any(|l| l.unwrap().len() < 11));
3409    }
3410
    #[test]
    fn test_rng_distribution() {
        // Sanity test to make sure our RNG is giving us well distributed values.
        // We generate some 4-byte integers, histogram them into 256 buckets keyed by
        // the top 8 bits of each value, and make sure each bucket has a good # of values
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<UInt32Type>();
        for _ in 0..10 {
            let arr = genn.generate(RowCount::from(10000), &mut rng).unwrap();
            let int_arr = arr.as_any().downcast_ref::<UInt32Array>().unwrap();
            let mut buckets = vec![0_u32; 256];
            for val in int_arr.values() {
                // Shifting a u32 right by 24 leaves its top byte, i.e. a bucket index in 0..256
                buckets[(*val >> 24) as usize] += 1;
            }
            for bucket in buckets {
                // Perfectly even distribution would have 10000 / 256 values (~40) per bucket
                // We test for 15 which should be "good enough" and statistically unlikely to fail
                assert!(bucket > 15);
            }
        }
    }
3432
    #[test]
    fn test_nulls() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.3);

        let arr = genn.generate(RowCount::from(1000), &mut rng).unwrap();

        // This assert depends on the default seed
        assert_eq!(arr.null_count(), 297);

        // Try every batch length from 0 to 99 so the reported null count is checked
        // against the validity bitmap for many different lengths
        for len in 0..100 {
            let arr = genn.generate(RowCount::from(len), &mut rng).unwrap();
            // Make sure the null count we came up with matches the actual # of unset bits
            assert_eq!(
                arr.null_count(),
                arr.nulls()
                    .map(|nulls| (len as usize)
                        - nulls.buffer().count_set_bits_offset(0, len as usize))
                    .unwrap_or(0)
            );
        }

        // A null probability of 0.0 must never produce a null
        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.0);
        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();

        assert_eq!(arr.null_count(), 0);

        // A null probability of 1.0 must make every value null
        let mut genn = array::rand::<Int32Type>().with_random_nulls(1.0);
        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();

        assert_eq!(arr.null_count(), 10);
        assert!((0..10).all(|idx| arr.is_null(idx)));

        // An explicit null pattern (`true` marks a null) repeats every three rows
        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, true]);
        let arr = genn.generate(RowCount::from(7), &mut rng).unwrap();
        assert!((0..2).all(|idx| arr.is_valid(idx)));
        assert!(arr.is_null(2));
        assert!((3..5).all(|idx| arr.is_valid(idx)));
        assert!(arr.is_null(5));
        assert!(arr.is_valid(6));
    }
3474
3475    #[test]
3476    fn test_unit_circle() {
3477        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3478        let mut genn = array::cycle_unit_circle(4);
3479        let arr = genn.generate(RowCount::from(6), &mut rng).unwrap();
3480
3481        let arr_values = arr
3482            .as_fixed_size_list()
3483            .values()
3484            .as_primitive::<Float32Type>()
3485            .values()
3486            .to_vec();
3487        assert_eq!(arr_values.len(), 12);
3488        let expected_values = [1.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 1.0];
3489        for (actual, expected) in arr_values.iter().zip(expected_values.iter()) {
3490            assert!((actual - expected).abs() < 0.0001);
3491        }
3492    }
3493
3494    #[test]
3495    fn test_jitter_centroids() {
3496        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3497        let mut centroids_gen = array::cycle_unit_circle(4);
3498        let centroids = centroids_gen.generate(RowCount::from(4), &mut rng).unwrap();
3499
3500        let centroid_values = centroids
3501            .as_fixed_size_list()
3502            .values()
3503            .as_primitive::<Float32Type>()
3504            .values()
3505            .to_vec();
3506
3507        let mut jitter_jen = array::jitter_centroids(centroids, 0.001);
3508        let jittered = jitter_jen.generate(RowCount::from(100), &mut rng).unwrap();
3509
3510        let values = jittered
3511            .as_fixed_size_list()
3512            .values()
3513            .as_primitive::<Float32Type>()
3514            .values()
3515            .to_vec();
3516
3517        for i in 0..100 {
3518            let centroid = i % 4;
3519            let centroid_x = centroid_values[centroid * 2];
3520            let centroid_y = centroid_values[centroid * 2 + 1];
3521            let value_x = values[i * 2];
3522            let value_y = values[i * 2 + 1];
3523
3524            let l2_dist = ((value_x - centroid_x).powi(2) + (value_y - centroid_y).powi(2)).sqrt();
3525            assert!(l2_dist < 0.001001);
3526            assert!(l2_dist > 0.000999);
3527        }
3528    }
3529
3530    #[test]
3531    fn test_rand_schema() {
3532        let schema = Schema::new(vec![
3533            Field::new("a", DataType::Int32, true),
3534            Field::new("b", DataType::Utf8, true),
3535            Field::new("c", DataType::Float32, true),
3536            Field::new("d", DataType::Int32, true),
3537            Field::new("e", DataType::Int32, true),
3538        ]);
3539        let rbr = rand(&schema)
3540            .into_reader_bytes(
3541                ByteCount::from(1024 * 1024),
3542                BatchCount::from(8),
3543                RoundingBehavior::ExactOrErr,
3544            )
3545            .unwrap();
3546        assert_eq!(*rbr.schema(), schema);
3547
3548        let batches = rbr.map(|val| val.unwrap()).collect::<Vec<_>>();
3549        assert_eq!(batches.len(), 8);
3550
3551        for batch in batches {
3552            assert_eq!(batch.num_rows(), 1024 * 1024 / 32);
3553            assert_eq!(batch.num_columns(), 5);
3554        }
3555    }
3556}