1use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc};
5
6use arrow::{
7 array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder},
8 buffer::{BooleanBuffer, Buffer, OffsetBuffer, ScalarBuffer},
9 datatypes::{
10 ArrowPrimitiveType, Float32Type, Int32Type, Int64Type, IntervalDayTime,
11 IntervalMonthDayNano, UInt32Type,
12 },
13};
14use arrow_array::{
15 make_array,
16 types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
17 Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray,
18 LargeStringArray, ListArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
19 RecordBatchOptions, RecordBatchReader, StringArray, StructArray,
20};
21use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
22use futures::{stream::BoxStream, StreamExt};
23use rand::{distr::Uniform, Rng, RngCore, SeedableRng};
24use random_word;
25
26use self::array::rand_with_distribution;
27
/// A number of rows.
#[derive(Debug, Default, Clone, Copy)]
pub struct RowCount(u64);

/// A number of batches.
#[derive(Debug, Default, Clone, Copy)]
pub struct BatchCount(u32);

/// A number of bytes.
#[derive(Debug, Default, Clone, Copy)]
pub struct ByteCount(u64);

/// A dimension (for example the width of a fixed-size list).
#[derive(Debug, Default, Clone, Copy)]
pub struct Dimension(u32);

impl From<u64> for RowCount {
    /// Wraps a raw row count.
    fn from(rows: u64) -> Self {
        RowCount(rows)
    }
}

impl From<u32> for BatchCount {
    /// Wraps a raw batch count.
    fn from(batches: u32) -> Self {
        BatchCount(batches)
    }
}

impl From<u64> for ByteCount {
    /// Wraps a raw byte count.
    fn from(bytes: u64) -> Self {
        ByteCount(bytes)
    }
}

impl From<u32> for Dimension {
    /// Wraps a raw dimension.
    fn from(dim: u32) -> Self {
        Dimension(dim)
    }
}
60
61pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
63 fn generate(
77 &mut self,
78 length: RowCount,
79 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
80 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError>;
81
82 fn generate_default(
92 &mut self,
93 length: RowCount,
94 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
95 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
96 Self::generate(self, length, &mut rng)
97 }
98 fn data_type(&self) -> &DataType;
104 fn metadata(&self) -> Option<HashMap<String, String>> {
106 None
107 }
108 fn element_size_bytes(&self) -> Option<ByteCount>;
114}
115
/// Wraps another generator and stamps a repeating validity pattern onto the
/// arrays it produces.
#[derive(Debug)]
pub struct CycleNullGenerator {
    /// Generator whose output receives the validity pattern.
    generator: Box<dyn ArrayGenerator>,
    /// Pattern cycled over the rows; it is used directly as the validity
    /// bitmap, so `true` marks a valid row and `false` a null row.
    validity: Vec<bool>,
    /// Position within `validity` at which the next batch starts.
    idx: usize,
}
/// Wraps another generator and overwrites floating-point values with NaN
/// following a repeating pattern.
#[derive(Debug)]
pub struct CycleNanGenerator {
    /// Generator whose output receives the NaN pattern.
    generator: Box<dyn ArrayGenerator>,
    /// Pattern cycled over the rows; `true` means the row becomes NaN.
    nan_pattern: Vec<bool>,
    /// Position within `nan_pattern` at which the next batch starts.
    idx: usize,
}
128
129impl ArrayGenerator for CycleNanGenerator {
130 fn generate(
131 &mut self,
132 length: RowCount,
133 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
134 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
135 let array = self.generator.generate(length, rng)?;
136
137 match array.data_type() {
139 DataType::Float16 => {
140 let float_array = array
141 .as_any()
142 .downcast_ref::<arrow_array::Float16Array>()
143 .unwrap();
144 let mut values: Vec<half::f16> = float_array.values().to_vec();
145
146 for (i, &should_be_nan) in self
147 .nan_pattern
148 .iter()
149 .cycle()
150 .skip(self.idx)
151 .take(length.0 as usize)
152 .enumerate()
153 {
154 if should_be_nan {
155 values[i] = half::f16::NAN;
156 }
157 }
158
159 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
160 Ok(Arc::new(arrow_array::Float16Array::from(values)))
161 }
162 DataType::Float32 => {
163 let float_array = array
164 .as_any()
165 .downcast_ref::<arrow_array::Float32Array>()
166 .unwrap();
167 let mut values: Vec<f32> = float_array.values().to_vec();
168
169 for (i, &should_be_nan) in self
170 .nan_pattern
171 .iter()
172 .cycle()
173 .skip(self.idx)
174 .take(length.0 as usize)
175 .enumerate()
176 {
177 if should_be_nan {
178 values[i] = f32::NAN;
179 }
180 }
181
182 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
183 Ok(Arc::new(arrow_array::Float32Array::from(values)))
184 }
185 DataType::Float64 => {
186 let float_array = array
187 .as_any()
188 .downcast_ref::<arrow_array::Float64Array>()
189 .unwrap();
190 let mut values: Vec<f64> = float_array.values().to_vec();
191
192 for (i, &should_be_nan) in self
193 .nan_pattern
194 .iter()
195 .cycle()
196 .skip(self.idx)
197 .take(length.0 as usize)
198 .enumerate()
199 {
200 if should_be_nan {
201 values[i] = f64::NAN;
202 }
203 }
204
205 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
206 Ok(Arc::new(arrow_array::Float64Array::from(values)))
207 }
208 _ => {
209 Ok(array)
211 }
212 }
213 }
214
215 fn data_type(&self) -> &DataType {
216 self.generator.data_type()
217 }
218
219 fn element_size_bytes(&self) -> Option<ByteCount> {
220 self.generator.element_size_bytes()
221 }
222}
223
224impl ArrayGenerator for CycleNullGenerator {
225 fn generate(
226 &mut self,
227 length: RowCount,
228 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
229 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
230 let array = self.generator.generate(length, rng)?;
231 let data = array.to_data();
232 let validity_itr = self
233 .validity
234 .iter()
235 .cycle()
236 .skip(self.idx)
237 .take(length.0 as usize)
238 .copied();
239 let validity_bitmap = BooleanBuffer::from_iter(validity_itr);
240
241 self.idx = (self.idx + (length.0 as usize)) % self.validity.len();
242 unsafe {
243 let new_data = ArrayData::new_unchecked(
244 data.data_type().clone(),
245 data.len(),
246 None,
247 Some(validity_bitmap.into_inner()),
248 data.offset(),
249 data.buffers().to_vec(),
250 data.child_data().into(),
251 );
252 Ok(make_array(new_data))
253 }
254 }
255
256 fn data_type(&self) -> &DataType {
257 self.generator.data_type()
258 }
259
260 fn element_size_bytes(&self) -> Option<ByteCount> {
261 self.generator.element_size_bytes()
262 }
263}
264
265#[derive(Debug)]
266pub struct MetadataGenerator {
267 generator: Box<dyn ArrayGenerator>,
268 metadata: HashMap<String, String>,
269}
270
271impl ArrayGenerator for MetadataGenerator {
272 fn generate(
273 &mut self,
274 length: RowCount,
275 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
276 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
277 self.generator.generate(length, rng)
278 }
279
280 fn metadata(&self) -> Option<HashMap<String, String>> {
281 Some(self.metadata.clone())
282 }
283
284 fn data_type(&self) -> &DataType {
285 self.generator.data_type()
286 }
287
288 fn element_size_bytes(&self) -> Option<ByteCount> {
289 self.generator.element_size_bytes()
290 }
291}
292
293#[derive(Debug)]
294pub struct NullGenerator {
295 generator: Box<dyn ArrayGenerator>,
296 null_probability: f64,
297}
298
299impl ArrayGenerator for NullGenerator {
300 fn generate(
301 &mut self,
302 length: RowCount,
303 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
304 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
305 let array = self.generator.generate(length, rng)?;
306 let data = array.to_data();
307
308 if self.null_probability < 0.0 || self.null_probability > 1.0 {
309 return Err(ArrowError::InvalidArgumentError(format!(
310 "null_probability must be between 0 and 1, got {}",
311 self.null_probability
312 )));
313 }
314
315 let (null_count, new_validity) = if self.null_probability == 0.0 {
316 if data.null_count() == 0 {
317 return Ok(array);
318 } else {
319 (0_usize, None)
320 }
321 } else if self.null_probability == 1.0 {
322 if data.null_count() == data.len() {
323 return Ok(array);
324 } else {
325 let all_nulls = BooleanBuffer::new_unset(array.len());
326 (array.len(), Some(all_nulls.into_inner()))
327 }
328 } else {
329 let array_len = array.len();
330 let num_validity_bytes = array_len.div_ceil(8);
331 let mut null_count = 0;
332 let threshold = (self.null_probability * u8::MAX as f64) as u8;
335 let bytes = (0..num_validity_bytes)
336 .map(|byte_idx| {
337 let mut sample = rng.random::<u64>();
338 let mut byte: u8 = 0;
339 for bit_idx in 0..8 {
340 byte <<= 1;
343 let pos = byte_idx * 8 + (7 - bit_idx);
344 if pos < array_len {
345 let sample_piece = sample & 0xFF;
346 let is_null = (sample_piece as u8) < threshold;
347 byte |= (!is_null) as u8;
348 null_count += is_null as usize;
349 }
350 sample >>= 8;
351 }
352 byte
353 })
354 .collect::<Vec<_>>();
355 let new_validity = Buffer::from_iter(bytes);
356 (null_count, Some(new_validity))
357 };
358
359 unsafe {
360 let new_data = ArrayData::new_unchecked(
361 data.data_type().clone(),
362 data.len(),
363 Some(null_count),
364 new_validity,
365 data.offset(),
366 data.buffers().to_vec(),
367 data.child_data().into(),
368 );
369 Ok(make_array(new_data))
370 }
371 }
372
373 fn metadata(&self) -> Option<HashMap<String, String>> {
374 self.generator.metadata()
375 }
376
377 fn data_type(&self) -> &DataType {
378 self.generator.data_type()
379 }
380
381 fn element_size_bytes(&self) -> Option<ByteCount> {
382 self.generator.element_size_bytes()
383 }
384}
385
386pub trait ArrayGeneratorExt {
387 fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator>;
389 fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
391 fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator>;
395 fn with_validity(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
397 fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator>;
398}
399
400impl ArrayGeneratorExt for Box<dyn ArrayGenerator> {
401 fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator> {
402 Box::new(NullGenerator {
403 generator: self,
404 null_probability,
405 })
406 }
407
408 fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator> {
409 Box::new(CycleNullGenerator {
410 generator: self,
411 validity: nulls.iter().map(|v| !*v).collect(),
412 idx: 0,
413 })
414 }
415
416 fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator> {
417 Box::new(CycleNanGenerator {
418 generator: self,
419 nan_pattern: nans.to_vec(),
420 idx: 0,
421 })
422 }
423
424 fn with_validity(self, validity: &[bool]) -> Box<dyn ArrayGenerator> {
425 Box::new(CycleNullGenerator {
426 generator: self,
427 validity: validity.to_vec(),
428 idx: 0,
429 })
430 }
431
432 fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator> {
433 Box::new(MetadataGenerator {
434 generator: self,
435 metadata,
436 })
437 }
438}
439
/// Iterator adaptor that yields every item of the inner iterator `n` times in
/// a row.
///
/// It can be resumed mid-run: `cur` is the item currently being repeated and
/// `count` is how many more times it must still be yielded before the next
/// item is pulled from `iter`.
pub struct NTimesIter<I: Iterator>
where
    I::Item: Copy,
{
    /// Source of fresh items.
    iter: I,
    /// Number of times each item is yielded (values below 1 behave like 1).
    n: u32,
    /// Item currently being repeated.
    cur: I::Item,
    /// Remaining repeats of `cur` before a fresh item is fetched.
    count: u32,
}

impl<I: Iterator> Iterator for NTimesIter<I>
where
    I::Item: Copy,
{
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        if self.count == 0 {
            // Fetch first and only then arm the repeat counter, so that an
            // exhausted inner iterator does not leave a non-zero `count`
            // behind that would make later calls yield the stale `cur`.
            self.cur = self.iter.next()?;
            // `saturating_sub` keeps `n == 0` from underflowing.
            self.count = self.n.saturating_sub(1);
        } else {
            self.count -= 1;
        }
        Some(self.cur)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let per_item = self.n.max(1) as usize;
        // Repeats of `cur` that are still owed come on top of the inner items.
        let pending = self.count as usize;
        let (lower, upper) = self.iter.size_hint();
        let lower = lower.saturating_mul(per_item).saturating_add(pending);
        let upper = upper
            .and_then(|u| u.checked_mul(per_item))
            .and_then(|u| u.checked_add(pending));
        (lower, upper)
    }
}
478
479pub struct FnGen<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T>
480where
481 T: Copy + Default,
482 ArrayType: arrow_array::Array + From<Vec<T>>,
483{
484 data_type: DataType,
485 generator: F,
486 array_type: PhantomData<ArrayType>,
487 repeat: u32,
488 leftover: T,
489 leftover_count: u32,
490 element_size_bytes: Option<ByteCount>,
491}
492
493impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> std::fmt::Debug
494 for FnGen<T, ArrayType, F>
495where
496 T: Copy + Default,
497 ArrayType: arrow_array::Array + From<Vec<T>>,
498{
499 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
500 f.debug_struct("FnGen")
501 .field("data_type", &self.data_type)
502 .field("array_type", &self.array_type)
503 .field("repeat", &self.repeat)
504 .field("leftover_count", &self.leftover_count)
505 .field("element_size_bytes", &self.element_size_bytes)
506 .finish()
507 }
508}
509
510impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
511where
512 T: Copy + Default,
513 ArrayType: arrow_array::Array + From<Vec<T>>,
514{
515 fn new_known_size(
516 data_type: DataType,
517 generator: F,
518 repeat: u32,
519 element_size_bytes: ByteCount,
520 ) -> Self {
521 Self {
522 data_type,
523 generator,
524 array_type: PhantomData,
525 repeat,
526 leftover: T::default(),
527 leftover_count: 0,
528 element_size_bytes: Some(element_size_bytes),
529 }
530 }
531
532 fn new_unknown_size(data_type: DataType, generator: F, repeat: u32) -> Self {
533 Self {
534 data_type,
535 generator,
536 array_type: PhantomData,
537 repeat,
538 leftover: T::default(),
539 leftover_count: 0,
540 element_size_bytes: None,
541 }
542 }
543}
544
545impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> ArrayGenerator
546 for FnGen<T, ArrayType, F>
547where
548 T: Copy + Default + Send + Sync,
549 ArrayType: arrow_array::Array + From<Vec<T>> + 'static,
550 F: Send + Sync,
551{
552 fn generate(
553 &mut self,
554 length: RowCount,
555 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
556 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
557 let iter = (0..length.0).map(|_| (self.generator)(rng));
558 let values = if self.repeat > 1 {
559 Vec::from_iter(
560 NTimesIter {
561 iter,
562 n: self.repeat,
563 cur: self.leftover,
564 count: self.leftover_count,
565 }
566 .take(length.0 as usize),
567 )
568 } else {
569 Vec::from_iter(iter)
570 };
571 self.leftover_count = ((self.leftover_count as u64 + length.0) % self.repeat as u64) as u32;
572 self.leftover = values.last().copied().unwrap_or(T::default());
573 Ok(Arc::new(ArrayType::from(values)))
574 }
575
576 fn data_type(&self) -> &DataType {
577 &self.data_type
578 }
579
580 fn element_size_bytes(&self) -> Option<ByteCount> {
581 self.element_size_bytes
582 }
583}
584
/// Seed value for the random number generator.
#[derive(Debug, Clone, Copy)]
pub struct Seed(pub u64);

/// Seed used when the caller does not supply one.
pub const DEFAULT_SEED: Seed = Seed(42);

impl From<u64> for Seed {
    /// Wraps a raw seed value.
    fn from(value: u64) -> Self {
        Seed(value)
    }
}
594
595#[derive(Debug)]
596pub struct CycleVectorGenerator {
597 underlying_gen: Box<dyn ArrayGenerator>,
598 dimension: Dimension,
599 data_type: DataType,
600}
601
602impl CycleVectorGenerator {
603 pub fn new(underlying_gen: Box<dyn ArrayGenerator>, dimension: Dimension) -> Self {
604 let data_type = DataType::FixedSizeList(
605 Arc::new(Field::new("item", underlying_gen.data_type().clone(), true)),
606 dimension.0 as i32,
607 );
608 Self {
609 underlying_gen,
610 dimension,
611 data_type,
612 }
613 }
614}
615
616impl ArrayGenerator for CycleVectorGenerator {
617 fn generate(
618 &mut self,
619 length: RowCount,
620 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
621 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
622 let values = self
623 .underlying_gen
624 .generate(RowCount::from(length.0 * self.dimension.0 as u64), rng)?;
625 let field = Arc::new(Field::new("item", values.data_type().clone(), true));
626 let values = Arc::new(values);
627
628 let array = FixedSizeListArray::try_new(field, self.dimension.0 as i32, values, None)?;
629
630 Ok(Arc::new(array))
631 }
632
633 fn data_type(&self) -> &DataType {
634 &self.data_type
635 }
636
637 fn element_size_bytes(&self) -> Option<ByteCount> {
638 self.underlying_gen
639 .element_size_bytes()
640 .map(|byte_count| ByteCount::from(byte_count.0 * self.dimension.0 as u64))
641 }
642}
643
644#[derive(Debug)]
645pub struct CycleListGenerator {
646 underlying_gen: Box<dyn ArrayGenerator>,
647 lengths_gen: Box<dyn ArrayGenerator>,
648 data_type: DataType,
649}
650
651impl CycleListGenerator {
652 pub fn new(
653 underlying_gen: Box<dyn ArrayGenerator>,
654 min_list_size: Dimension,
655 max_list_size: Dimension,
656 ) -> Self {
657 let data_type = DataType::List(Arc::new(Field::new(
658 "item",
659 underlying_gen.data_type().clone(),
660 true,
661 )));
662 let lengths_dist = Uniform::new(min_list_size.0, max_list_size.0).unwrap();
663 let lengths_gen = rand_with_distribution::<UInt32Type, Uniform<u32>>(lengths_dist);
664 Self {
665 underlying_gen,
666 lengths_gen,
667 data_type,
668 }
669 }
670}
671
672impl ArrayGenerator for CycleListGenerator {
673 fn generate(
674 &mut self,
675 length: RowCount,
676 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
677 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
678 let lengths = self.lengths_gen.generate(length, rng)?;
679 let lengths = lengths.as_primitive::<UInt32Type>();
680 let total_length = lengths.values().iter().map(|i| *i as u64).sum::<u64>();
681 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
682 let values = self
683 .underlying_gen
684 .generate(RowCount::from(total_length), rng)?;
685 let field = Arc::new(Field::new("item", values.data_type().clone(), true));
686 let values = Arc::new(values);
687
688 let array = ListArray::try_new(field, offsets, values, None)?;
689
690 Ok(Arc::new(array))
691 }
692
693 fn data_type(&self) -> &DataType {
694 &self.data_type
695 }
696
697 fn element_size_bytes(&self) -> Option<ByteCount> {
698 None
699 }
700}
701
702#[derive(Debug, Default)]
703pub struct PseudoUuidGenerator {}
704
705impl ArrayGenerator for PseudoUuidGenerator {
706 fn generate(
707 &mut self,
708 length: RowCount,
709 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
710 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
711 Ok(Arc::new(FixedSizeBinaryArray::try_from_iter(
712 (0..length.0).map(|_| {
713 let mut data = vec![0; 16];
714 rng.fill_bytes(&mut data);
715 data
716 }),
717 )?))
718 }
719
720 fn data_type(&self) -> &DataType {
721 &DataType::FixedSizeBinary(16)
722 }
723
724 fn element_size_bytes(&self) -> Option<ByteCount> {
725 Some(ByteCount::from(16))
726 }
727}
728
729#[derive(Debug, Default)]
730pub struct PseudoUuidHexGenerator {}
731
732impl ArrayGenerator for PseudoUuidHexGenerator {
733 fn generate(
734 &mut self,
735 length: RowCount,
736 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
737 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
738 let mut data = vec![0; 16 * length.0 as usize];
739 rng.fill_bytes(&mut data);
740 let data_hex = hex::encode(data);
741
742 Ok(Arc::new(StringArray::from_iter_values(
743 (0..length.0 as usize).map(|i| data_hex.get(i * 32..(i + 1) * 32).unwrap()),
744 )))
745 }
746
747 fn data_type(&self) -> &DataType {
748 &DataType::Utf8
749 }
750
751 fn element_size_bytes(&self) -> Option<ByteCount> {
752 Some(ByteCount::from(16))
753 }
754}
755
756#[derive(Debug, Default)]
757pub struct RandomBooleanGenerator {}
758
759impl ArrayGenerator for RandomBooleanGenerator {
760 fn generate(
761 &mut self,
762 length: RowCount,
763 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
764 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
765 let num_bytes = length.0.div_ceil(8);
766 let mut bytes = vec![0; num_bytes as usize];
767 rng.fill_bytes(&mut bytes);
768 let bytes = BooleanBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
769 Ok(Arc::new(arrow_array::BooleanArray::new(bytes, None)))
770 }
771
772 fn data_type(&self) -> &DataType {
773 &DataType::Boolean
774 }
775
776 fn element_size_bytes(&self) -> Option<ByteCount> {
777 None
780 }
781}
782
783pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
786 phantom: PhantomData<T>,
787 data_type: DataType,
788}
789
790impl<T: ArrowPrimitiveType + Send + Sync> std::fmt::Debug for RandomBytesGenerator<T> {
791 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
792 f.debug_struct("RandomBytesGenerator")
793 .field("data_type", &self.data_type)
794 .finish()
795 }
796}
797
798impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
799 fn new(data_type: DataType) -> Self {
800 Self {
801 phantom: Default::default(),
802 data_type,
803 }
804 }
805
806 fn byte_width() -> Result<u64, ArrowError> {
807 T::DATA_TYPE.primitive_width().ok_or_else(|| ArrowError::InvalidArgumentError(format!("Cannot generate the data type {} with the RandomBytesGenerator because it is not a fixed-width bytes type", T::DATA_TYPE))).map(|val| val as u64)
808 }
809}
810
811impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerator<T> {
812 fn generate(
813 &mut self,
814 length: RowCount,
815 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
816 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
817 let num_bytes = length.0 * Self::byte_width()?;
818 let mut bytes = vec![0; num_bytes as usize];
819 rng.fill_bytes(&mut bytes);
820 let bytes = ScalarBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
821 Ok(Arc::new(
822 PrimitiveArray::<T>::new(bytes, None).with_data_type(self.data_type.clone()),
823 ))
824 }
825
826 fn data_type(&self) -> &DataType {
827 &self.data_type
828 }
829
830 fn element_size_bytes(&self) -> Option<ByteCount> {
831 Self::byte_width().map(ByteCount::from).ok()
832 }
833}
834
835#[derive(Debug)]
838pub struct RandomFixedSizeBinaryGenerator {
839 data_type: DataType,
840 size: i32,
841}
842
843impl RandomFixedSizeBinaryGenerator {
844 fn new(size: i32) -> Self {
845 Self {
846 size,
847 data_type: DataType::FixedSizeBinary(size),
848 }
849 }
850}
851
852impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
853 fn generate(
854 &mut self,
855 length: RowCount,
856 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
857 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
858 let num_bytes = length.0 * self.size as u64;
859 let mut bytes = vec![0; num_bytes as usize];
860 rng.fill_bytes(&mut bytes);
861 Ok(Arc::new(FixedSizeBinaryArray::new(
862 self.size,
863 Buffer::from(bytes),
864 None,
865 )))
866 }
867
868 fn data_type(&self) -> &DataType {
869 &self.data_type
870 }
871
872 fn element_size_bytes(&self) -> Option<ByteCount> {
873 Some(ByteCount::from(self.size as u64))
874 }
875}
876
877#[derive(Debug)]
878pub struct RandomIntervalGenerator {
879 unit: IntervalUnit,
880 data_type: DataType,
881}
882
883impl RandomIntervalGenerator {
884 pub fn new(unit: IntervalUnit) -> Self {
885 Self {
886 unit,
887 data_type: DataType::Interval(unit),
888 }
889 }
890}
891
892impl ArrayGenerator for RandomIntervalGenerator {
893 fn generate(
894 &mut self,
895 length: RowCount,
896 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
897 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
898 match self.unit {
899 IntervalUnit::YearMonth => {
900 let months = (0..length.0)
901 .map(|_| rng.random::<i32>())
902 .collect::<Vec<_>>();
903 Ok(Arc::new(arrow_array::IntervalYearMonthArray::from(months)))
904 }
905 IntervalUnit::MonthDayNano => {
906 let day_time_array = (0..length.0)
907 .map(|_| IntervalMonthDayNano::new(rng.random(), rng.random(), rng.random()))
908 .collect::<Vec<_>>();
909 Ok(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(
910 day_time_array,
911 )))
912 }
913 IntervalUnit::DayTime => {
914 let day_time_array = (0..length.0)
915 .map(|_| IntervalDayTime::new(rng.random(), rng.random()))
916 .collect::<Vec<_>>();
917 Ok(Arc::new(arrow_array::IntervalDayTimeArray::from(
918 day_time_array,
919 )))
920 }
921 }
922 }
923
924 fn data_type(&self) -> &DataType {
925 &self.data_type
926 }
927
928 fn element_size_bytes(&self) -> Option<ByteCount> {
929 Some(ByteCount::from(12))
930 }
931}
932#[derive(Debug)]
933pub struct RandomBinaryGenerator {
934 bytes_per_element: ByteCount,
935 scale_to_utf8: bool,
936 is_large: bool,
937 data_type: DataType,
938}
939
940impl RandomBinaryGenerator {
941 pub fn new(bytes_per_element: ByteCount, scale_to_utf8: bool, is_large: bool) -> Self {
942 Self {
943 bytes_per_element,
944 scale_to_utf8,
945 is_large,
946 data_type: match (scale_to_utf8, is_large) {
947 (false, false) => DataType::Binary,
948 (false, true) => DataType::LargeBinary,
949 (true, false) => DataType::Utf8,
950 (true, true) => DataType::LargeUtf8,
951 },
952 }
953 }
954}
955
956impl ArrayGenerator for RandomBinaryGenerator {
957 fn generate(
958 &mut self,
959 length: RowCount,
960 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
961 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
962 let mut bytes = vec![0; (self.bytes_per_element.0 * length.0) as usize];
963 rng.fill_bytes(&mut bytes);
964 if self.scale_to_utf8 {
965 bytes = bytes.into_iter().map(|val| (val % 95) + 32).collect();
968 }
969 let bytes = Buffer::from(bytes);
970 if self.is_large {
971 let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
972 self.bytes_per_element.0 as usize,
973 length.0 as usize,
974 ));
975 if self.scale_to_utf8 {
976 unsafe {
978 Ok(Arc::new(arrow_array::LargeStringArray::new_unchecked(
979 offsets, bytes, None,
980 )))
981 }
982 } else {
983 unsafe {
984 Ok(Arc::new(arrow_array::LargeBinaryArray::new_unchecked(
985 offsets, bytes, None,
986 )))
987 }
988 }
989 } else {
990 let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
991 self.bytes_per_element.0 as usize,
992 length.0 as usize,
993 ));
994 if self.scale_to_utf8 {
995 unsafe {
997 Ok(Arc::new(arrow_array::StringArray::new_unchecked(
998 offsets, bytes, None,
999 )))
1000 }
1001 } else {
1002 unsafe {
1003 Ok(Arc::new(arrow_array::BinaryArray::new_unchecked(
1004 offsets, bytes, None,
1005 )))
1006 }
1007 }
1008 }
1009 }
1010
1011 fn data_type(&self) -> &DataType {
1012 &self.data_type
1013 }
1014
1015 fn element_size_bytes(&self) -> Option<ByteCount> {
1016 Some(ByteCount::from(
1018 self.bytes_per_element.0 + std::mem::size_of::<i32>() as u64,
1019 ))
1020 }
1021}
1022
1023#[derive(Debug)]
1027pub struct PrefixPlusCounterGenerator {
1028 prefix: String,
1029 is_large: bool,
1030 data_type: DataType,
1031 current_counter: u64,
1032}
1033
1034impl PrefixPlusCounterGenerator {
1035 pub fn new(prefix: String, is_large: bool) -> Self {
1036 Self {
1037 prefix,
1038 is_large,
1039 data_type: if is_large {
1040 DataType::LargeUtf8
1041 } else {
1042 DataType::Utf8
1043 },
1044 current_counter: 0,
1045 }
1046 }
1047
1048 fn generate_values<T: OffsetSizeTrait>(
1049 &self,
1050 start: u64,
1051 num_values: u64,
1052 ) -> Result<Arc<dyn Array>, ArrowError> {
1053 let max_counter = start + num_values;
1054 let max_digits_per_counter = (max_counter as f64).log10().ceil() as u64;
1055 let max_bytes_per_str = max_digits_per_counter + self.prefix.len() as u64;
1056 let max_bytes = max_bytes_per_str * num_values;
1057 let mut builder =
1058 GenericStringBuilder::<T>::with_capacity(num_values as usize, max_bytes as usize);
1059 let mut word = String::with_capacity(max_bytes_per_str as usize);
1060 word.push_str(&self.prefix);
1061 for i in 0..num_values {
1062 let counter = start + i;
1063 word.truncate(self.prefix.len());
1064 word.push_str(&counter.to_string());
1065 builder.append_value(&word);
1066 }
1067 Ok(Arc::new(builder.finish()))
1068 }
1069}
1070
1071impl ArrayGenerator for PrefixPlusCounterGenerator {
1072 fn generate(
1073 &mut self,
1074 length: RowCount,
1075 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1076 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1077 let start = self.current_counter;
1078 self.current_counter += length.0;
1079 if self.is_large {
1080 self.generate_values::<i64>(start, length.0)
1081 } else {
1082 self.generate_values::<i32>(start, length.0)
1083 }
1084 }
1085
1086 fn data_type(&self) -> &DataType {
1087 &self.data_type
1088 }
1089
1090 fn element_size_bytes(&self) -> Option<ByteCount> {
1091 None
1093 }
1094}
1095
1096#[derive(Debug)]
1102pub struct BinaryPrefixPlusCounterGenerator {
1103 prefix: Arc<[u8]>,
1104 is_large: bool,
1105 data_type: DataType,
1106 current_counter: u64,
1107}
1108
1109impl BinaryPrefixPlusCounterGenerator {
1110 pub fn new(prefix: Arc<[u8]>, is_large: bool) -> Self {
1111 Self {
1112 prefix,
1113 is_large,
1114 data_type: if is_large {
1115 DataType::LargeBinary
1116 } else {
1117 DataType::Binary
1118 },
1119 current_counter: 0,
1120 }
1121 }
1122
1123 fn generate_values<T: OffsetSizeTrait>(
1124 &self,
1125 start: u64,
1126 num_values: u64,
1127 ) -> Result<Arc<dyn Array>, ArrowError> {
1128 let max_bytes = (self.prefix.len() + std::mem::size_of::<u64>()) * num_values as usize;
1129 let mut builder = GenericBinaryBuilder::<T>::with_capacity(num_values as usize, max_bytes);
1130 let mut word = Vec::with_capacity(self.prefix.len() + std::mem::size_of::<u64>());
1131 word.extend_from_slice(&self.prefix);
1132 for i in 0..num_values {
1133 let counter = start + i;
1134 word.truncate(self.prefix.len());
1135 if counter < u8::MAX as u64 {
1136 word.push(counter as u8);
1137 } else if counter < u16::MAX as u64 {
1138 word.extend_from_slice(&(counter as u16).to_le_bytes());
1139 } else if counter < u32::MAX as u64 {
1140 word.extend_from_slice(&(counter as u32).to_le_bytes());
1141 } else {
1142 word.extend_from_slice(&counter.to_le_bytes());
1143 }
1144 builder.append_value(&word);
1145 }
1146 Ok(Arc::new(builder.finish()))
1147 }
1148}
1149
1150impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
1151 fn generate(
1152 &mut self,
1153 length: RowCount,
1154 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1155 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1156 let start = self.current_counter;
1157 self.current_counter += length.0;
1158 if self.is_large {
1159 self.generate_values::<i64>(start, length.0)
1160 } else {
1161 self.generate_values::<i32>(start, length.0)
1162 }
1163 }
1164
1165 fn data_type(&self) -> &DataType {
1166 &self.data_type
1167 }
1168
1169 fn element_size_bytes(&self) -> Option<ByteCount> {
1170 None
1172 }
1173}
1174
1175#[derive(Debug)]
1176struct RandomSentenceGenerator {
1177 min_words: usize,
1178 max_words: usize,
1179 words: &'static [&'static str],
1180 is_large: bool,
1181}
1182
1183impl RandomSentenceGenerator {
1184 pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self {
1185 let words = random_word::all(random_word::Lang::En);
1186 Self {
1187 min_words,
1188 max_words,
1189 words,
1190 is_large,
1191 }
1192 }
1193}
1194
1195impl ArrayGenerator for RandomSentenceGenerator {
1196 fn generate(
1197 &mut self,
1198 length: RowCount,
1199 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1200 ) -> Result<Arc<dyn Array>, ArrowError> {
1201 let mut values = Vec::with_capacity(length.0 as usize);
1202
1203 for _ in 0..length.0 {
1204 let num_words = rng.random_range(self.min_words..=self.max_words);
1205 let sentence: String = (0..num_words)
1206 .map(|_| self.words[rng.random_range(0..self.words.len())])
1207 .collect::<Vec<_>>()
1208 .join(" ");
1209 values.push(sentence);
1210 }
1211
1212 if self.is_large {
1213 Ok(Arc::new(LargeStringArray::from(values)))
1214 } else {
1215 Ok(Arc::new(StringArray::from(values)))
1216 }
1217 }
1218
1219 fn data_type(&self) -> &DataType {
1220 if self.is_large {
1221 &DataType::LargeUtf8
1222 } else {
1223 &DataType::Utf8
1224 }
1225 }
1226
1227 fn element_size_bytes(&self) -> Option<ByteCount> {
1228 let avg_word_length = 6;
1231 let avg_words = (self.min_words + self.max_words) / 2;
1232 Some(ByteCount::from((avg_word_length * avg_words) as u64))
1233 }
1234}
1235
1236#[derive(Debug)]
1237struct RandomWordGenerator {
1238 words: &'static [&'static str],
1239 is_large: bool,
1240}
1241
1242impl RandomWordGenerator {
1243 pub fn new(is_large: bool) -> Self {
1244 let words = random_word::all(random_word::Lang::En);
1245 Self { words, is_large }
1246 }
1247}
1248
1249impl ArrayGenerator for RandomWordGenerator {
1250 fn generate(
1251 &mut self,
1252 length: RowCount,
1253 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1254 ) -> Result<Arc<dyn Array>, ArrowError> {
1255 let mut values = Vec::with_capacity(length.0 as usize);
1256
1257 for _ in 0..length.0 {
1258 let word = self.words[rng.random_range(0..self.words.len())];
1259 values.push(word.to_string());
1260 }
1261
1262 if self.is_large {
1263 Ok(Arc::new(LargeStringArray::from(values)))
1264 } else {
1265 Ok(Arc::new(StringArray::from(values)))
1266 }
1267 }
1268
1269 fn data_type(&self) -> &DataType {
1270 if self.is_large {
1271 &DataType::LargeUtf8
1272 } else {
1273 &DataType::Utf8
1274 }
1275 }
1276
1277 fn element_size_bytes(&self) -> Option<ByteCount> {
1278 Some(ByteCount::from(5))
1280 }
1281}
1282
/// Generates `Binary` arrays whose elements are random bytes of randomly chosen length.
#[derive(Debug)]
pub struct VariableRandomBinaryGenerator {
    /// Generates the (Int32) byte length of each element
    lengths_gen: Box<dyn ArrayGenerator>,
    /// Arrow data type of the generated arrays (set to `DataType::Binary` by `new`)
    data_type: DataType,
}
1288
1289impl VariableRandomBinaryGenerator {
1290 pub fn new(min_bytes_per_element: ByteCount, max_bytes_per_element: ByteCount) -> Self {
1291 let lengths_dist = Uniform::new_inclusive(
1292 min_bytes_per_element.0 as i32,
1293 max_bytes_per_element.0 as i32,
1294 )
1295 .unwrap();
1296 let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1297
1298 Self {
1299 lengths_gen,
1300 data_type: DataType::Binary,
1301 }
1302 }
1303}
1304
1305impl ArrayGenerator for VariableRandomBinaryGenerator {
1306 fn generate(
1307 &mut self,
1308 length: RowCount,
1309 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1310 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1311 let lengths = self.lengths_gen.generate(length, rng)?;
1312 let lengths = lengths.as_primitive::<Int32Type>();
1313 let total_length = lengths.values().iter().map(|i| *i as usize).sum::<usize>();
1314 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1315 let mut bytes = vec![0; total_length];
1316 rng.fill_bytes(&mut bytes);
1317 let bytes = Buffer::from(bytes);
1318 Ok(Arc::new(BinaryArray::try_new(offsets, bytes, None)?))
1319 }
1320
1321 fn data_type(&self) -> &DataType {
1322 &self.data_type
1323 }
1324
1325 fn element_size_bytes(&self) -> Option<ByteCount> {
1326 None
1327 }
1328}
1329
/// Generates byte arrays (of Arrow byte array type `T`) by cycling through a fixed list
/// of values.
pub struct CycleBinaryGenerator<T: ByteArrayType> {
    /// Bytes of every source value, concatenated in order
    values: Vec<u8>,
    /// Byte length of each source value, in order
    lengths: Vec<usize>,
    /// Arrow data type of the generated arrays (`T::DATA_TYPE`)
    data_type: DataType,
    /// Marker tying the generator to the byte array type `T`
    array_type: PhantomData<T>,
    /// Reported per-element size; `None` when the source values differ in length
    width: Option<ByteCount>,
    /// Index into `lengths` of the value the next generated row starts from
    idx: usize,
}
1338
1339impl<T: ByteArrayType> std::fmt::Debug for CycleBinaryGenerator<T> {
1340 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1341 f.debug_struct("CycleBinaryGenerator")
1342 .field("values", &self.values)
1343 .field("lengths", &self.lengths)
1344 .field("data_type", &self.data_type)
1345 .field("width", &self.width)
1346 .field("idx", &self.idx)
1347 .finish()
1348 }
1349}
1350
1351impl<T: ByteArrayType> CycleBinaryGenerator<T> {
1352 pub fn from_strings(values: &[&str]) -> Self {
1353 if values.is_empty() {
1354 panic!("Attempt to create a cycle generator with no values");
1355 }
1356 let lengths = values.iter().map(|s| s.len()).collect::<Vec<_>>();
1357 let typical_length = lengths[0];
1358 let width = if lengths.iter().all(|item| *item == typical_length) {
1359 Some(ByteCount::from(
1360 typical_length as u64 + std::mem::size_of::<i32>() as u64,
1361 ))
1362 } else {
1363 None
1364 };
1365 let values = values
1366 .iter()
1367 .flat_map(|s| s.as_bytes().iter().copied())
1368 .collect::<Vec<_>>();
1369 Self {
1370 values,
1371 lengths,
1372 data_type: T::DATA_TYPE,
1373 array_type: PhantomData,
1374 width,
1375 idx: 0,
1376 }
1377 }
1378}
1379
1380impl<T: ByteArrayType> ArrayGenerator for CycleBinaryGenerator<T> {
1381 fn generate(
1382 &mut self,
1383 length: RowCount,
1384 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1385 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1386 let lengths = self
1387 .lengths
1388 .iter()
1389 .copied()
1390 .cycle()
1391 .skip(self.idx)
1392 .take(length.0 as usize);
1393 let num_bytes = lengths.clone().sum();
1394 let byte_offset = self.lengths[0..self.idx].iter().sum();
1395 let bytes = self
1396 .values
1397 .iter()
1398 .cycle()
1399 .skip(byte_offset)
1400 .copied()
1401 .take(num_bytes)
1402 .collect::<Vec<_>>();
1403 let bytes = Buffer::from(bytes);
1404 let offsets = OffsetBuffer::from_lengths(lengths);
1405 self.idx = (self.idx + length.0 as usize) % self.lengths.len();
1406 Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1407 offsets, bytes, None,
1408 )))
1409 }
1410
1411 fn data_type(&self) -> &DataType {
1412 &self.data_type
1413 }
1414
1415 fn element_size_bytes(&self) -> Option<ByteCount> {
1416 self.width
1417 }
1418}
1419
/// Generates byte arrays (of Arrow byte array type `T`) in which every element is the
/// same value.
pub struct FixedBinaryGenerator<T: ByteArrayType> {
    /// The bytes repeated for every element
    value: Vec<u8>,
    /// Arrow data type of the generated arrays (`T::DATA_TYPE`)
    data_type: DataType,
    /// Marker tying the generator to the byte array type `T`
    array_type: PhantomData<T>,
}
1425
1426impl<T: ByteArrayType> std::fmt::Debug for FixedBinaryGenerator<T> {
1427 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1428 f.debug_struct("FixedBinaryGenerator")
1429 .field("value", &self.value)
1430 .field("data_type", &self.data_type)
1431 .finish()
1432 }
1433}
1434
1435impl<T: ByteArrayType> FixedBinaryGenerator<T> {
1436 pub fn new(value: Vec<u8>) -> Self {
1437 Self {
1438 value,
1439 data_type: T::DATA_TYPE,
1440 array_type: PhantomData,
1441 }
1442 }
1443}
1444
1445impl<T: ByteArrayType> ArrayGenerator for FixedBinaryGenerator<T> {
1446 fn generate(
1447 &mut self,
1448 length: RowCount,
1449 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1450 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1451 let bytes = Buffer::from(Vec::from_iter(
1452 self.value
1453 .iter()
1454 .cycle()
1455 .take((length.0 * self.value.len() as u64) as usize)
1456 .copied(),
1457 ));
1458 let offsets =
1459 OffsetBuffer::from_lengths(iter::repeat_n(self.value.len(), length.0 as usize));
1460 Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1461 offsets, bytes, None,
1462 )))
1463 }
1464
1465 fn data_type(&self) -> &DataType {
1466 &self.data_type
1467 }
1468
1469 fn element_size_bytes(&self) -> Option<ByteCount> {
1470 Some(ByteCount::from(
1472 self.value.len() as u64 + std::mem::size_of::<i32>() as u64,
1473 ))
1474 }
1475}
1476
/// Wraps another generator and casts its output to a dictionary array with key type `K`.
pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
    /// Generates the plain (non-dictionary) values
    generator: Box<dyn ArrayGenerator>,
    /// `Dictionary(K, <value type of generator>)`
    data_type: DataType,
    /// Marker tying the generator to the key type `K`
    key_type: PhantomData<K>,
    /// Width in bytes of one dictionary key
    key_width: u64,
}
1483
1484impl<K: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryGenerator<K> {
1485 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1486 f.debug_struct("DictionaryGenerator")
1487 .field("generator", &self.generator)
1488 .field("data_type", &self.data_type)
1489 .field("key_width", &self.key_width)
1490 .finish()
1491 }
1492}
1493
1494impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
1495 fn new(generator: Box<dyn ArrayGenerator>) -> Self {
1496 let key_type = Box::new(K::DATA_TYPE);
1497 let key_width = key_type
1498 .primitive_width()
1499 .expect("dictionary key types should have a known width")
1500 as u64;
1501 let val_type = Box::new(generator.data_type().clone());
1502 let dict_type = DataType::Dictionary(key_type, val_type);
1503 Self {
1504 generator,
1505 data_type: dict_type,
1506 key_type: PhantomData,
1507 key_width,
1508 }
1509 }
1510}
1511
1512impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGenerator<K> {
1513 fn generate(
1514 &mut self,
1515 length: RowCount,
1516 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1517 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1518 let underlying = self.generator.generate(length, rng)?;
1519 arrow_cast::cast::cast(&underlying, &self.data_type)
1520 }
1521
1522 fn data_type(&self) -> &DataType {
1523 &self.data_type
1524 }
1525
1526 fn element_size_bytes(&self) -> Option<ByteCount> {
1527 self.generator
1528 .element_size_bytes()
1529 .map(|size_bytes| ByteCount::from(size_bytes.0 + self.key_width))
1530 }
1531}
1532
/// Generates list arrays whose lists have random lengths and whose items come from
/// another generator.
#[derive(Debug)]
struct RandomListGenerator {
    /// Field carrying the list data type reported by `data_type()`
    field: Arc<Field>,
    /// Field describing the list items (named "item", nullable)
    child_field: Arc<Field>,
    /// Generates the list items
    items_gen: Box<dyn ArrayGenerator>,
    /// Generates the length of each list (Int64 when `is_large`, Int32 otherwise)
    lengths_gen: Box<dyn ArrayGenerator>,
    /// When true the output is a `LargeListArray`, otherwise a `ListArray`
    is_large: bool,
}
1541
1542impl RandomListGenerator {
1543 fn new(items_gen: Box<dyn ArrayGenerator>, is_large: bool) -> Self {
1545 let child_field = Arc::new(Field::new("item", items_gen.data_type().clone(), true));
1546 let list_type = if is_large {
1547 DataType::LargeList(child_field.clone())
1548 } else {
1549 DataType::List(child_field.clone())
1550 };
1551 let field = Field::new("", list_type, true);
1552 let lengths_gen = if is_large {
1553 let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1554 rand_with_distribution::<Int64Type, Uniform<i64>>(lengths_dist)
1555 } else {
1556 let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1557 rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist)
1558 };
1559 Self {
1560 field: Arc::new(field),
1561 child_field,
1562 items_gen,
1563 lengths_gen,
1564 is_large,
1565 }
1566 }
1567}
1568
1569impl ArrayGenerator for RandomListGenerator {
1570 fn generate(
1571 &mut self,
1572 length: RowCount,
1573 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1574 ) -> Result<Arc<dyn Array>, ArrowError> {
1575 let lengths = self.lengths_gen.generate(length, rng)?;
1576 if self.is_large {
1577 let lengths = lengths.as_primitive::<Int64Type>();
1578 let total_length = lengths.values().iter().sum::<i64>() as u64;
1579 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1580 let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1581 Ok(Arc::new(LargeListArray::try_new(
1582 self.child_field.clone(),
1583 offsets,
1584 items,
1585 None,
1586 )?))
1587 } else {
1588 let lengths = lengths.as_primitive::<Int32Type>();
1589 let total_length = lengths.values().iter().sum::<i32>() as u64;
1590 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1591 let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1592 Ok(Arc::new(ListArray::try_new(
1593 self.child_field.clone(),
1594 offsets,
1595 items,
1596 None,
1597 )?))
1598 }
1599 }
1600
1601 fn data_type(&self) -> &DataType {
1602 self.field.data_type()
1603 }
1604
1605 fn element_size_bytes(&self) -> Option<ByteCount> {
1606 None
1607 }
1608}
1609
/// Generates `NullArray`s of the requested length.
#[derive(Debug)]
struct NullArrayGenerator {}
1612
1613impl ArrayGenerator for NullArrayGenerator {
1614 fn generate(
1615 &mut self,
1616 length: RowCount,
1617 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1618 ) -> Result<Arc<dyn Array>, ArrowError> {
1619 Ok(Arc::new(NullArray::new(length.0 as usize)))
1620 }
1621
1622 fn data_type(&self) -> &DataType {
1623 &DataType::Null
1624 }
1625
1626 fn element_size_bytes(&self) -> Option<ByteCount> {
1627 None
1628 }
1629}
1630
/// Generates 2-dimensional Float32 vectors `(cos(angle), sin(angle))` whose angle
/// advances by a fixed fraction of a full turn with every generated row.
#[derive(Debug)]
struct RadialStepGenerator {
    /// Number of rows after which the angle wraps around to zero
    num_steps_per_circle: u32,
    /// Field describing the vector items (Float32, non-nullable, named "item")
    data_field: Arc<Field>,
    /// `FixedSizeList(data_field, 2)`
    data_type: DataType,
    /// Step the next generated row will use, in `0..num_steps_per_circle`
    current_step: u32,
}
1639
1640impl RadialStepGenerator {
1641 fn new(num_steps_per_circle: u32) -> Self {
1642 let data_field = Arc::new(Field::new("item", DataType::Float32, false));
1643 let data_type = DataType::FixedSizeList(data_field.clone(), 2);
1644 Self {
1645 num_steps_per_circle,
1646 data_field,
1647 data_type,
1648 current_step: 0,
1649 }
1650 }
1651}
1652
1653impl ArrayGenerator for RadialStepGenerator {
1654 fn generate(
1655 &mut self,
1656 length: RowCount,
1657 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1658 ) -> Result<Arc<dyn Array>, ArrowError> {
1659 let mut values_builder = Float32Builder::with_capacity(length.0 as usize * 2);
1660 for _ in 0..length.0 {
1661 let angle = (self.current_step as f32) / (self.num_steps_per_circle as f32)
1662 * 2.0
1663 * std::f32::consts::PI;
1664 values_builder.append_value(angle.cos());
1665 values_builder.append_value(angle.sin());
1666 self.current_step = (self.current_step + 1) % self.num_steps_per_circle;
1667 }
1668 let values = values_builder.finish();
1669 let vectors =
1670 FixedSizeListArray::try_new(self.data_field.clone(), 2, Arc::new(values), None)?;
1671 Ok(Arc::new(vectors))
1672 }
1673
1674 fn data_type(&self) -> &DataType {
1675 &self.data_type
1676 }
1677
1678 fn element_size_bytes(&self) -> Option<ByteCount> {
1679 Some(ByteCount::from(8))
1680 }
1681}
1682
/// Generates Float32 vectors by cycling through a set of centroids and adding a random
/// noise vector to each one.
#[derive(Debug)]
struct JitterCentroidsGenerator {
    /// Flattened values of all centroid vectors
    centroids: Float32Array,
    /// Number of values per vector
    dimension: u32,
    /// L2 norm that each noise vector is scaled to before being added
    noise_level: f32,
    /// `FixedSizeList(data_field, dimension)`
    data_type: DataType,
    /// Field describing the vector items, taken from the centroids array
    data_field: Arc<Field>,

    /// Index into `centroids` of the first value of the next centroid to use
    offset: usize,
}
1694
1695impl JitterCentroidsGenerator {
1696 fn try_new(centroids: Arc<dyn Array>, noise_level: f32) -> Result<Self, ArrowError> {
1697 let DataType::FixedSizeList(values_field, dimension) = centroids.data_type() else {
1698 return Err(ArrowError::InvalidArgumentError(
1699 "Centroids must be a FixedSizeList".to_string(),
1700 ));
1701 };
1702 if values_field.data_type() != &DataType::Float32 {
1703 return Err(ArrowError::InvalidArgumentError(
1704 "Centroids values must be a Float32".to_string(),
1705 ));
1706 }
1707 let data_type = DataType::FixedSizeList(values_field.clone(), *dimension);
1708 Ok(Self {
1709 centroids: centroids
1710 .as_fixed_size_list()
1711 .values()
1712 .as_primitive::<Float32Type>()
1713 .clone(),
1714 dimension: *dimension as u32,
1715 noise_level,
1716 data_type,
1717 data_field: values_field.clone(),
1718 offset: 0,
1719 })
1720 }
1721}
1722
1723impl ArrayGenerator for JitterCentroidsGenerator {
1724 fn generate(
1725 &mut self,
1726 length: RowCount,
1727 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1728 ) -> Result<Arc<dyn Array>, ArrowError> {
1729 let mut values_builder =
1730 Float32Builder::with_capacity(length.0 as usize * self.dimension as usize);
1731 for _ in 0..length.0 {
1732 let mut noise = (0..self.dimension as usize)
1734 .map(|_| rng.random::<f32>())
1735 .collect::<Vec<_>>();
1736 let scale = self.noise_level / noise.iter().map(|v| v * v).sum::<f32>().sqrt();
1738 noise.iter_mut().for_each(|v| *v *= scale);
1739
1740 for (i, noise) in noise.into_iter().enumerate() {
1742 let centroid_val = self.centroids.value(self.offset + i);
1743 let jittered_val = centroid_val + noise;
1744 values_builder.append_value(jittered_val);
1745 }
1746 self.offset = (self.offset + self.dimension as usize) % self.centroids.len();
1748 }
1749 let values = values_builder.finish();
1750 let vectors = FixedSizeListArray::try_new(
1751 self.data_field.clone(),
1752 self.dimension as i32,
1753 Arc::new(values),
1754 None,
1755 )?;
1756 Ok(Arc::new(vectors))
1757 }
1758
1759 fn data_type(&self) -> &DataType {
1760 &self.data_type
1761 }
1762
1763 fn element_size_bytes(&self) -> Option<ByteCount> {
1764 Some(ByteCount::from(self.dimension as u64 * 4))
1765 }
1766}
/// Generates struct arrays by generating each child column with its own generator.
#[derive(Debug)]
struct RandomStructGenerator {
    /// Fields of the generated struct
    fields: Fields,
    /// `DataType::Struct(fields)`
    data_type: DataType,
    /// One generator per child column
    child_gens: Vec<Box<dyn ArrayGenerator>>,
}
1773
1774impl RandomStructGenerator {
1775 fn new(fields: Fields, child_gens: Vec<Box<dyn ArrayGenerator>>) -> Self {
1776 let data_type = DataType::Struct(fields.clone());
1777 Self {
1778 fields,
1779 data_type,
1780 child_gens,
1781 }
1782 }
1783}
1784
1785impl ArrayGenerator for RandomStructGenerator {
1786 fn generate(
1787 &mut self,
1788 length: RowCount,
1789 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1790 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1791 if self.child_gens.is_empty() {
1792 let struct_arr = StructArray::new_empty_fields(length.0 as usize, None);
1795 return Ok(Arc::new(struct_arr));
1796 }
1797 let child_arrays = self
1798 .child_gens
1799 .iter_mut()
1800 .map(|genn| genn.generate(length, rng))
1801 .collect::<Result<Vec<_>, ArrowError>>()?;
1802 let struct_arr = StructArray::new(self.fields.clone(), child_arrays, None);
1803 Ok(Arc::new(struct_arr))
1804 }
1805
1806 fn data_type(&self) -> &DataType {
1807 &self.data_type
1808 }
1809
1810 fn element_size_bytes(&self) -> Option<ByteCount> {
1811 let mut sum = 0;
1812 for child_gen in &self.child_gens {
1813 sum += child_gen.element_size_bytes()?.0;
1814 }
1815 Some(ByteCount::from(sum))
1816 }
1817}
1818
/// A `RecordBatchReader` that produces a fixed number of generated batches, each with
/// the same number of rows.
pub struct FixedSizeBatchGenerator {
    /// Random number generator shared by all column generators
    rng: rand_xoshiro::Xoshiro256PlusPlus,
    /// One generator per column, in schema order
    generators: Vec<Box<dyn ArrayGenerator>>,
    /// Number of rows in every batch
    batch_size: RowCount,
    /// Number of batches still to be produced
    num_batches: BatchCount,
    /// Schema of the produced batches
    schema: SchemaRef,
}
1827
1828impl FixedSizeBatchGenerator {
1829 fn new(
1830 generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
1831 batch_size: RowCount,
1832 num_batches: BatchCount,
1833 seed: Option<Seed>,
1834 default_null_probability: Option<f64>,
1835 ) -> Self {
1836 let mut fields = Vec::with_capacity(generators.len());
1837 for (field_index, field_gen) in generators.iter().enumerate() {
1838 let (name, genn) = field_gen;
1839 let default_name = format!("field_{}", field_index);
1840 let name = name.clone().unwrap_or(default_name);
1841 let mut field = Field::new(name, genn.data_type().clone(), true);
1842 if let Some(metadata) = genn.metadata() {
1843 field = field.with_metadata(metadata);
1844 }
1845 fields.push(field);
1846 }
1847 let mut generators = generators
1848 .into_iter()
1849 .map(|(_, genn)| genn)
1850 .collect::<Vec<_>>();
1851 if let Some(null_probability) = default_null_probability {
1852 generators = generators
1853 .into_iter()
1854 .map(|genn| genn.with_random_nulls(null_probability))
1855 .collect();
1856 }
1857 let schema = Arc::new(Schema::new(fields));
1858 Self {
1859 rng: rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
1860 seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
1861 ),
1862 generators,
1863 batch_size,
1864 num_batches,
1865 schema,
1866 }
1867 }
1868
1869 fn gen_next(&mut self) -> Result<RecordBatch, ArrowError> {
1870 let mut arrays = Vec::with_capacity(self.generators.len());
1871 for genn in self.generators.iter_mut() {
1872 let arr = genn.generate(self.batch_size, &mut self.rng)?;
1873 arrays.push(arr);
1874 }
1875 self.num_batches.0 -= 1;
1876 Ok(RecordBatch::try_new_with_options(
1877 self.schema.clone(),
1878 arrays,
1879 &RecordBatchOptions::new().with_row_count(Some(self.batch_size.0 as usize)),
1880 )
1881 .unwrap())
1882 }
1883}
1884
1885impl Iterator for FixedSizeBatchGenerator {
1886 type Item = Result<RecordBatch, ArrowError>;
1887
1888 fn next(&mut self) -> Option<Self::Item> {
1889 if self.num_batches.0 == 0 {
1890 return None;
1891 }
1892 Some(self.gen_next())
1893 }
1894}
1895
1896impl RecordBatchReader for FixedSizeBatchGenerator {
1897 fn schema(&self) -> SchemaRef {
1898 self.schema.clone()
1899 }
1900}
1901
/// Builder that collects column generators and turns them into record batches or
/// record batch readers.
#[derive(Default)]
pub struct BatchGeneratorBuilder {
    /// Column generators in schema order, each with an optional column name
    generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
    /// If set, passed to the batch generator as the default null probability
    default_null_probability: Option<f64>,
    /// If set, seeds the random number generator used for generation
    seed: Option<Seed>,
}
1912
/// How to pick the number of rows per batch when a requested batch size in bytes is not
/// an exact multiple of the size of one row.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RoundingBehavior {
    /// Fail with an error
    ExactOrErr,
    /// Use one more row, so the batch is slightly larger than requested
    RoundUp,
    /// Drop the partial row, so the batch is slightly smaller than requested
    RoundDown,
}
1918
1919impl BatchGeneratorBuilder {
1920 pub fn new() -> Self {
1922 Default::default()
1923 }
1924
1925 pub fn new_with_seed(seed: Seed) -> Self {
1927 Self {
1928 seed: Some(seed),
1929 ..Default::default()
1930 }
1931 }
1932
1933 pub fn col(mut self, name: impl Into<String>, genn: Box<dyn ArrayGenerator>) -> Self {
1937 self.generators.push((Some(name.into()), genn));
1938 self
1939 }
1940
1941 pub fn anon_col(mut self, genn: Box<dyn ArrayGenerator>) -> Self {
1945 self.generators.push((None, genn));
1946 self
1947 }
1948
1949 pub fn into_batch_rows(self, batch_size: RowCount) -> Result<RecordBatch, ArrowError> {
1950 let mut reader = self.into_reader_rows(batch_size, BatchCount::from(1));
1951 reader
1952 .next()
1953 .expect("Asked for 1 batch but reader was empty")
1954 }
1955
1956 pub fn into_batch_bytes(
1957 self,
1958 batch_size: ByteCount,
1959 rounding: RoundingBehavior,
1960 ) -> Result<RecordBatch, ArrowError> {
1961 let mut reader = self.into_reader_bytes(batch_size, BatchCount::from(1), rounding)?;
1962 reader
1963 .next()
1964 .expect("Asked for 1 batch but reader was empty")
1965 }
1966
1967 pub fn into_reader_rows(
1969 self,
1970 batch_size: RowCount,
1971 num_batches: BatchCount,
1972 ) -> impl RecordBatchReader {
1973 FixedSizeBatchGenerator::new(
1974 self.generators,
1975 batch_size,
1976 num_batches,
1977 self.seed,
1978 self.default_null_probability,
1979 )
1980 }
1981
1982 pub fn into_reader_stream(
1983 self,
1984 batch_size: RowCount,
1985 num_batches: BatchCount,
1986 ) -> (
1987 BoxStream<'static, Result<RecordBatch, ArrowError>>,
1988 Arc<Schema>,
1989 ) {
1990 let reader = self.into_reader_rows(batch_size, num_batches);
1992 let schema = reader.schema();
1993 let batches = reader.collect::<Vec<_>>();
1994 (futures::stream::iter(batches).boxed(), schema)
1995 }
1996
1997 pub fn into_reader_bytes(
1999 self,
2000 batch_size_bytes: ByteCount,
2001 num_batches: BatchCount,
2002 rounding: RoundingBehavior,
2003 ) -> Result<impl RecordBatchReader, ArrowError> {
2004 let bytes_per_row = self
2005 .generators
2006 .iter()
2007 .map(|genn| genn.1.element_size_bytes().map(|byte_count| byte_count.0).ok_or(
2008 ArrowError::NotYetImplemented("The function into_reader_bytes currently requires each array generator to have a fixed element size".to_string())
2009 )
2010 )
2011 .sum::<Result<u64, ArrowError>>()?;
2012 let mut num_rows = RowCount::from(batch_size_bytes.0 / bytes_per_row);
2013 if !batch_size_bytes.0.is_multiple_of(bytes_per_row) {
2014 match rounding {
2015 RoundingBehavior::ExactOrErr => {
2016 return Err(ArrowError::NotYetImplemented(
2017 format!("Exact rounding requested but not possible. Batch size requested {}, row size: {}", batch_size_bytes.0, bytes_per_row))
2018 );
2019 }
2020 RoundingBehavior::RoundUp => {
2021 num_rows = RowCount::from(num_rows.0 + 1);
2022 }
2023 RoundingBehavior::RoundDown => (),
2024 }
2025 }
2026 Ok(self.into_reader_rows(num_rows, num_batches))
2027 }
2028
2029 pub fn with_seed(mut self, seed: Seed) -> Self {
2031 self.seed = Some(seed);
2032 self
2033 }
2034
2035 pub fn with_random_nulls(&mut self, default_null_probability: f64) {
2037 self.default_null_probability = Some(default_null_probability);
2038 }
2039}
2040
/// Builder that turns a single array generator into an array.
pub struct ArrayGeneratorBuilder {
    /// The generator that produces the array
    generator: Box<dyn ArrayGenerator>,
    /// If set, seeds the random number generator; otherwise `DEFAULT_SEED` is used
    seed: Option<Seed>,
}
2046
2047impl ArrayGeneratorBuilder {
2048 fn new(generator: Box<dyn ArrayGenerator>) -> Self {
2049 Self {
2050 generator,
2051 seed: None,
2052 }
2053 }
2054
2055 pub fn with_seed(mut self, seed: Seed) -> Self {
2057 self.seed = Some(seed);
2058 self
2059 }
2060
2061 pub fn into_array_rows(
2063 mut self,
2064 length: RowCount,
2065 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2066 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2067 self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2068 );
2069 self.generator.generate(length, &mut rng)
2070 }
2071}
2072
/// Number of milliseconds in one day (24 h x 60 min x 60 s x 1000 ms).
const MS_PER_DAY: i64 = 24 * 60 * 60 * 1000;
2074
2075pub mod array {
2076
2077 use arrow::datatypes::{Int16Type, Int64Type, Int8Type};
2078 use arrow_array::types::{
2079 Decimal128Type, Decimal256Type, DurationMicrosecondType, DurationMillisecondType,
2080 DurationNanosecondType, DurationSecondType, Float16Type, Float32Type, Float64Type,
2081 UInt16Type, UInt32Type, UInt64Type, UInt8Type,
2082 };
2083 use arrow_array::{
2084 ArrowNativeTypeOp, BooleanArray, Date32Array, Date64Array, Time32MillisecondArray,
2085 Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
2086 TimestampMicrosecondArray, TimestampNanosecondArray, TimestampSecondArray,
2087 };
2088 use arrow_schema::{IntervalUnit, TimeUnit};
2089 use chrono::Utc;
2090 use rand::prelude::Distribution;
2091
2092 use super::*;
2093
2094 pub fn cycle_vec(
2099 generator: Box<dyn ArrayGenerator>,
2100 dimension: Dimension,
2101 ) -> Box<dyn ArrayGenerator> {
2102 Box::new(CycleVectorGenerator::new(generator, dimension))
2103 }
2104
2105 pub fn cycle_vec_var(
2110 generator: Box<dyn ArrayGenerator>,
2111 min_list_size: Dimension,
2112 max_list_size: Dimension,
2113 ) -> Box<dyn ArrayGenerator> {
2114 Box::new(CycleListGenerator::new(
2115 generator,
2116 min_list_size,
2117 max_list_size,
2118 ))
2119 }
2120
2121 pub fn cycle_unit_circle(num_steps: u32) -> Box<dyn ArrayGenerator> {
2126 Box::new(RadialStepGenerator::new(num_steps))
2127 }
2128
2129 pub fn jitter_centroids(centroids: Arc<dyn Array>, jitter: f32) -> Box<dyn ArrayGenerator> {
2133 Box::new(JitterCentroidsGenerator::try_new(centroids, jitter).unwrap())
2134 }
2135
2136 pub fn cycle<DataType>(values: Vec<DataType::Native>) -> Box<dyn ArrayGenerator>
2141 where
2142 DataType::Native: Copy + 'static,
2143 DataType: ArrowPrimitiveType,
2144 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2145 {
2146 let mut values_idx = 0;
2147 Box::new(
2148 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2149 DataType::DATA_TYPE,
2150 move |_| {
2151 let y = values[values_idx];
2152 values_idx = (values_idx + 1) % values.len();
2153 y
2154 },
2155 1,
2156 DataType::DATA_TYPE
2157 .primitive_width()
2158 .map(|width| ByteCount::from(width as u64))
2159 .expect("Primitive types should have a fixed width"),
2160 ),
2161 )
2162 }
2163
2164 pub fn cycle_bool(values: Vec<bool>) -> Box<dyn ArrayGenerator> {
2169 let mut values_idx = 0;
2170 Box::new(FnGen::<bool, BooleanArray, _>::new_unknown_size(
2171 DataType::Boolean,
2172 move |_| {
2173 let val = values[values_idx];
2174 values_idx = (values_idx + 1) % values.len();
2175 val
2176 },
2177 1,
2178 ))
2179 }
2180
2181 pub fn step<DataType>() -> Box<dyn ArrayGenerator>
2183 where
2184 DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2185 DataType: ArrowPrimitiveType,
2186 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2187 {
2188 let mut x = DataType::Native::default();
2189 Box::new(
2190 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2191 DataType::DATA_TYPE,
2192 move |_| {
2193 let y = x;
2194 x += DataType::Native::ONE;
2195 y
2196 },
2197 1,
2198 DataType::DATA_TYPE
2199 .primitive_width()
2200 .map(|width| ByteCount::from(width as u64))
2201 .expect("Primitive types should have a fixed width"),
2202 ),
2203 )
2204 }
2205
2206 pub fn blob() -> Box<dyn ArrayGenerator> {
2207 let mut blob_meta = HashMap::new();
2208 blob_meta.insert("lance-encoding:blob".to_string(), "true".to_string());
2209 rand_fixedbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
2210 }
2211
2212 pub fn step_custom<DataType>(
2214 start: DataType::Native,
2215 step: DataType::Native,
2216 ) -> Box<dyn ArrayGenerator>
2217 where
2218 DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2219 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2220 DataType: ArrowPrimitiveType,
2221 {
2222 let mut x = start;
2223 Box::new(
2224 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2225 DataType::DATA_TYPE,
2226 move |_| {
2227 let y = x;
2228 x += step;
2229 y
2230 },
2231 1,
2232 DataType::DATA_TYPE
2233 .primitive_width()
2234 .map(|width| ByteCount::from(width as u64))
2235 .expect("Primitive types should have a fixed width"),
2236 ),
2237 )
2238 }
2239
2240 pub fn fill<DataType>(value: DataType::Native) -> Box<dyn ArrayGenerator>
2242 where
2243 DataType::Native: Copy + 'static,
2244 DataType: ArrowPrimitiveType,
2245 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2246 {
2247 Box::new(
2248 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2249 DataType::DATA_TYPE,
2250 move |_| value,
2251 1,
2252 DataType::DATA_TYPE
2253 .primitive_width()
2254 .map(|width| ByteCount::from(width as u64))
2255 .expect("Primitive types should have a fixed width"),
2256 ),
2257 )
2258 }
2259
2260 pub fn fill_varbin(value: Vec<u8>) -> Box<dyn ArrayGenerator> {
2262 Box::new(FixedBinaryGenerator::<BinaryType>::new(value))
2263 }
2264
2265 pub fn fill_utf8(value: String) -> Box<dyn ArrayGenerator> {
2267 Box::new(FixedBinaryGenerator::<Utf8Type>::new(value.into_bytes()))
2268 }
2269
2270 pub fn cycle_utf8_literals(values: &[&'static str]) -> Box<dyn ArrayGenerator> {
2271 Box::new(CycleBinaryGenerator::<Utf8Type>::from_strings(values))
2272 }
2273
2274 pub fn rand<DataType>() -> Box<dyn ArrayGenerator>
2276 where
2277 DataType::Native: Copy + 'static,
2278 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2279 DataType: ArrowPrimitiveType,
2280 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2281 {
2282 Box::new(
2283 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2284 DataType::DATA_TYPE,
2285 move |rng| rng.random(),
2286 1,
2287 DataType::DATA_TYPE
2288 .primitive_width()
2289 .map(|width| ByteCount::from(width as u64))
2290 .expect("Primitive types should have a fixed width"),
2291 ),
2292 )
2293 }
2294
2295 pub fn rand_with_distribution<
2297 DataType,
2298 Dist: rand::distr::Distribution<DataType::Native> + Clone + Send + Sync + 'static,
2299 >(
2300 dist: Dist,
2301 ) -> Box<dyn ArrayGenerator>
2302 where
2303 DataType::Native: Copy + 'static,
2304 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2305 DataType: ArrowPrimitiveType,
2306 {
2307 Box::new(
2308 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2309 DataType::DATA_TYPE,
2310 move |rng| rng.sample(dist.clone()),
2311 1,
2312 DataType::DATA_TYPE
2313 .primitive_width()
2314 .map(|width| ByteCount::from(width as u64))
2315 .expect("Primitive types should have a fixed width"),
2316 ),
2317 )
2318 }
2319
2320 pub fn rand_vec<DataType>(dimension: Dimension) -> Box<dyn ArrayGenerator>
2322 where
2323 DataType::Native: Copy + 'static,
2324 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2325 DataType: ArrowPrimitiveType,
2326 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2327 {
2328 let underlying = rand::<DataType>();
2329 cycle_vec(underlying, dimension)
2330 }
2331
2332 pub fn rand_vec_nullable<DataType>(
2334 dimension: Dimension,
2335 null_probability: f64,
2336 ) -> Box<dyn ArrayGenerator>
2337 where
2338 DataType::Native: Copy + 'static,
2339 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2340 DataType: ArrowPrimitiveType,
2341 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2342 {
2343 let underlying = rand::<DataType>().with_random_nulls(null_probability);
2344 cycle_vec(underlying, dimension)
2345 }
2346
2347 pub fn rand_time32(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2350 let start = 0;
2351 let end = match resolution {
2352 TimeUnit::Second => 86_400,
2353 TimeUnit::Millisecond => 86_400_000,
2354 _ => panic!(),
2355 };
2356
2357 let data_type = DataType::Time32(*resolution);
2358 let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2359 let dist = Uniform::new(start, end).unwrap();
2360 let sample_fn = move |rng: &mut _| dist.sample(rng);
2361
2362 match resolution {
2363 TimeUnit::Second => Box::new(FnGen::<i32, Time32SecondArray, _>::new_known_size(
2364 data_type, sample_fn, 1, size,
2365 )),
2366 TimeUnit::Millisecond => {
2367 Box::new(FnGen::<i32, Time32MillisecondArray, _>::new_known_size(
2368 data_type, sample_fn, 1, size,
2369 ))
2370 }
2371 _ => panic!(),
2372 }
2373 }
2374
2375 pub fn rand_time64(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2378 let start = 0_i64;
2379 let end: i64 = match resolution {
2380 TimeUnit::Microsecond => 86_400_000,
2381 TimeUnit::Nanosecond => 86_400_000_000,
2382 _ => panic!(),
2383 };
2384
2385 let data_type = DataType::Time64(*resolution);
2386 let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2387 let dist = Uniform::new(start, end).unwrap();
2388 let sample_fn = move |rng: &mut _| dist.sample(rng);
2389
2390 match resolution {
2391 TimeUnit::Microsecond => {
2392 Box::new(FnGen::<i64, Time64MicrosecondArray, _>::new_known_size(
2393 data_type, sample_fn, 1, size,
2394 ))
2395 }
2396 TimeUnit::Nanosecond => {
2397 Box::new(FnGen::<i64, Time64NanosecondArray, _>::new_known_size(
2398 data_type, sample_fn, 1, size,
2399 ))
2400 }
2401 _ => panic!(),
2402 }
2403 }
2404
2405 pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
2411 Box::<PseudoUuidGenerator>::default()
2412 }
2413
2414 pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
2421 Box::<PseudoUuidHexGenerator>::default()
2422 }
2423
2424 pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(
2425 data_type: DataType,
2426 ) -> Box<dyn ArrayGenerator> {
2427 Box::new(RandomBytesGenerator::<T>::new(data_type))
2428 }
2429
2430 pub fn rand_fsb(size: i32) -> Box<dyn ArrayGenerator> {
2431 Box::new(RandomFixedSizeBinaryGenerator::new(size))
2432 }
2433
2434 pub fn rand_interval(unit: IntervalUnit) -> Box<dyn ArrayGenerator> {
2435 Box::new(RandomIntervalGenerator::new(unit))
2436 }
2437
2438 pub fn rand_date32() -> Box<dyn ArrayGenerator> {
2443 let now = chrono::Utc::now();
2444 let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try days");
2445 rand_date32_in_range(one_year_ago, now)
2446 }
2447
2448 pub fn rand_date32_in_range(
2450 start: chrono::DateTime<Utc>,
2451 end: chrono::DateTime<Utc>,
2452 ) -> Box<dyn ArrayGenerator> {
2453 let data_type = DataType::Date32;
2454 let end_ms = end.timestamp_millis();
2455 let end_days = (end_ms / MS_PER_DAY) as i32;
2456 let start_ms = start.timestamp_millis();
2457 let start_days = (start_ms / MS_PER_DAY) as i32;
2458 let dist = Uniform::new(start_days, end_days).unwrap();
2459
2460 Box::new(FnGen::<i32, Date32Array, _>::new_known_size(
2461 data_type,
2462 move |rng| dist.sample(rng),
2463 1,
2464 DataType::Date32
2465 .primitive_width()
2466 .map(|width| ByteCount::from(width as u64))
2467 .expect("Date32 should have a fixed width"),
2468 ))
2469 }
2470
2471 pub fn rand_date64() -> Box<dyn ArrayGenerator> {
2476 let now = chrono::Utc::now();
2477 let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try_days");
2478 rand_date64_in_range(one_year_ago, now)
2479 }
2480
2481 pub fn rand_timestamp_in_range(
2485 start: chrono::DateTime<Utc>,
2486 end: chrono::DateTime<Utc>,
2487 data_type: &DataType,
2488 ) -> Box<dyn ArrayGenerator> {
2489 let end_ms = end.timestamp_millis();
2490 let start_ms = start.timestamp_millis();
2491 let (start_ticks, end_ticks) = match data_type {
2492 DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2493 (start_ms * 1000 * 1000, end_ms * 1000 * 1000)
2494 }
2495 DataType::Timestamp(TimeUnit::Microsecond, _) => (start_ms * 1000, end_ms * 1000),
2496 DataType::Timestamp(TimeUnit::Millisecond, _) => (start_ms, end_ms),
2497 DataType::Timestamp(TimeUnit::Second, _) => (start.timestamp(), end.timestamp()),
2498 _ => panic!(),
2499 };
2500 let dist = Uniform::new(start_ticks, end_ticks).unwrap();
2501
2502 let data_type = data_type.clone();
2503 let sample_fn = move |rng: &mut _| dist.sample(rng);
2504 let width = data_type
2505 .primitive_width()
2506 .map(|width| ByteCount::from(width as u64))
2507 .unwrap();
2508
2509 match data_type {
2510 DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2511 Box::new(FnGen::<i64, TimestampNanosecondArray, _>::new_known_size(
2512 data_type, sample_fn, 1, width,
2513 ))
2514 }
2515 DataType::Timestamp(TimeUnit::Microsecond, _) => {
2516 Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2517 data_type, sample_fn, 1, width,
2518 ))
2519 }
2520 DataType::Timestamp(TimeUnit::Millisecond, _) => {
2521 Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2522 data_type, sample_fn, 1, width,
2523 ))
2524 }
2525 DataType::Timestamp(TimeUnit::Second, _) => {
2526 Box::new(FnGen::<i64, TimestampSecondArray, _>::new_known_size(
2527 data_type, sample_fn, 1, width,
2528 ))
2529 }
2530 _ => panic!(),
2531 }
2532 }
2533
2534 pub fn rand_timestamp(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2535 let now = chrono::Utc::now();
2536 let one_year_ago = now - chrono::Duration::try_days(365).unwrap();
2537 rand_timestamp_in_range(one_year_ago, now, data_type)
2538 }
2539
2540 pub fn rand_date64_in_range(
2545 start: chrono::DateTime<Utc>,
2546 end: chrono::DateTime<Utc>,
2547 ) -> Box<dyn ArrayGenerator> {
2548 let data_type = DataType::Date64;
2549 let end_ms = end.timestamp_millis();
2550 let end_days = end_ms / MS_PER_DAY;
2551 let start_ms = start.timestamp_millis();
2552 let start_days = start_ms / MS_PER_DAY;
2553 let dist = Uniform::new(start_days, end_days).unwrap();
2554
2555 Box::new(FnGen::<i64, Date64Array, _>::new_known_size(
2556 data_type,
2557 move |rng| (dist.sample(rng)) * MS_PER_DAY,
2558 1,
2559 DataType::Date64
2560 .primitive_width()
2561 .map(|width| ByteCount::from(width as u64))
2562 .expect("Date64 should have a fixed width"),
2563 ))
2564 }
2565
2566 pub fn rand_fixedbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2568 Box::new(RandomBinaryGenerator::new(
2569 bytes_per_element,
2570 false,
2571 is_large,
2572 ))
2573 }
2574
2575 pub fn rand_varbin(
2579 min_bytes_per_element: ByteCount,
2580 max_bytes_per_element: ByteCount,
2581 ) -> Box<dyn ArrayGenerator> {
2582 Box::new(VariableRandomBinaryGenerator::new(
2583 min_bytes_per_element,
2584 max_bytes_per_element,
2585 ))
2586 }
2587
2588 pub fn rand_utf8(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2592 Box::new(RandomBinaryGenerator::new(
2593 bytes_per_element,
2594 true,
2595 is_large,
2596 ))
2597 }
2598
2599 pub fn utf8_prefix_plus_counter(
2603 prefix: impl Into<String>,
2604 is_large: bool,
2605 ) -> Box<dyn ArrayGenerator> {
2606 Box::new(PrefixPlusCounterGenerator::new(prefix.into(), is_large))
2607 }
2608
2609 pub fn binary_prefix_plus_counter(
2610 prefix: Arc<[u8]>,
2611 is_large: bool,
2612 ) -> Box<dyn ArrayGenerator> {
2613 Box::new(BinaryPrefixPlusCounterGenerator::new(prefix, is_large))
2614 }
2615
2616 pub fn rand_boolean() -> Box<dyn ArrayGenerator> {
2618 Box::<RandomBooleanGenerator>::default()
2619 }
2620
2621 pub fn random_sentence(
2625 min_words: usize,
2626 max_words: usize,
2627 is_large: bool,
2628 ) -> Box<dyn ArrayGenerator> {
2629 Box::new(RandomSentenceGenerator::new(min_words, max_words, is_large))
2630 }
2631
2632 pub fn random_word(is_large: bool) -> Box<dyn ArrayGenerator> {
2636 Box::new(RandomWordGenerator::new(is_large))
2637 }
2638
2639 pub fn rand_list(item_type: &DataType, is_large: bool) -> Box<dyn ArrayGenerator> {
2640 let child_gen = rand_type(item_type);
2641 Box::new(RandomListGenerator::new(child_gen, is_large))
2642 }
2643
2644 pub fn rand_list_any(
2645 item_gen: Box<dyn ArrayGenerator>,
2646 is_large: bool,
2647 ) -> Box<dyn ArrayGenerator> {
2648 Box::new(RandomListGenerator::new(item_gen, is_large))
2649 }
2650
2651 pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
2652 let child_gens = fields
2653 .iter()
2654 .map(|f| rand_type(f.data_type()))
2655 .collect::<Vec<_>>();
2656 Box::new(RandomStructGenerator::new(fields, child_gens))
2657 }
2658
2659 pub fn null_type() -> Box<dyn ArrayGenerator> {
2660 Box::new(NullArrayGenerator {})
2661 }
2662
2663 pub fn rand_type(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2665 match data_type {
2666 DataType::Boolean => rand_boolean(),
2667 DataType::Int8 => rand::<Int8Type>(),
2668 DataType::Int16 => rand::<Int16Type>(),
2669 DataType::Int32 => rand::<Int32Type>(),
2670 DataType::Int64 => rand::<Int64Type>(),
2671 DataType::UInt8 => rand::<UInt8Type>(),
2672 DataType::UInt16 => rand::<UInt16Type>(),
2673 DataType::UInt32 => rand::<UInt32Type>(),
2674 DataType::UInt64 => rand::<UInt64Type>(),
2675 DataType::Float16 => rand_primitive::<Float16Type>(data_type.clone()),
2676 DataType::Float32 => rand::<Float32Type>(),
2677 DataType::Float64 => rand::<Float64Type>(),
2678 DataType::Decimal128(_, _) => rand_primitive::<Decimal128Type>(data_type.clone()),
2679 DataType::Decimal256(_, _) => rand_primitive::<Decimal256Type>(data_type.clone()),
2680 DataType::Utf8 => rand_utf8(ByteCount::from(12), false),
2681 DataType::LargeUtf8 => rand_utf8(ByteCount::from(12), true),
2682 DataType::Binary => rand_fixedbin(ByteCount::from(12), false),
2683 DataType::LargeBinary => rand_fixedbin(ByteCount::from(12), true),
2684 DataType::Dictionary(key_type, value_type) => {
2685 dict_type(rand_type(value_type), key_type)
2686 }
2687 DataType::FixedSizeList(child, dimension) => cycle_vec(
2688 rand_type(child.data_type()),
2689 Dimension::from(*dimension as u32),
2690 ),
2691 DataType::FixedSizeBinary(size) => rand_fsb(*size),
2692 DataType::List(child) => rand_list(child.data_type(), false),
2693 DataType::LargeList(child) => rand_list(child.data_type(), true),
2694 DataType::Duration(unit) => match unit {
2695 TimeUnit::Second => rand::<DurationSecondType>(),
2696 TimeUnit::Millisecond => rand::<DurationMillisecondType>(),
2697 TimeUnit::Microsecond => rand::<DurationMicrosecondType>(),
2698 TimeUnit::Nanosecond => rand::<DurationNanosecondType>(),
2699 },
2700 DataType::Interval(unit) => rand_interval(*unit),
2701 DataType::Date32 => rand_date32(),
2702 DataType::Date64 => rand_date64(),
2703 DataType::Time32(resolution) => rand_time32(resolution),
2704 DataType::Time64(resolution) => rand_time64(resolution),
2705 DataType::Timestamp(_, _) => rand_timestamp(data_type),
2706 DataType::Struct(fields) => rand_struct(fields.clone()),
2707 DataType::Null => null_type(),
2708 _ => unimplemented!("random generation of {}", data_type),
2709 }
2710 }
2711
2712 pub fn dict<K: ArrowDictionaryKeyType + Send + Sync>(
2718 generator: Box<dyn ArrayGenerator>,
2719 ) -> Box<dyn ArrayGenerator> {
2720 Box::new(DictionaryGenerator::<K>::new(generator))
2721 }
2722
2723 pub fn dict_type(
2725 generator: Box<dyn ArrayGenerator>,
2726 key_type: &DataType,
2727 ) -> Box<dyn ArrayGenerator> {
2728 match key_type {
2729 DataType::Int8 => dict::<Int8Type>(generator),
2730 DataType::Int16 => dict::<Int16Type>(generator),
2731 DataType::Int32 => dict::<Int32Type>(generator),
2732 DataType::Int64 => dict::<Int64Type>(generator),
2733 DataType::UInt8 => dict::<UInt8Type>(generator),
2734 DataType::UInt16 => dict::<UInt16Type>(generator),
2735 DataType::UInt32 => dict::<UInt32Type>(generator),
2736 DataType::UInt64 => dict::<UInt64Type>(generator),
2737 _ => unimplemented!(),
2738 }
2739 }
2740}
2741
2742pub fn gen_batch() -> BatchGeneratorBuilder {
2744 BatchGeneratorBuilder::default()
2745}
2746
2747pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
2749 ArrayGeneratorBuilder::new(genn)
2750}
2751
2752pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
2756 let mut builder = BatchGeneratorBuilder::default();
2757 for field in schema.fields() {
2758 builder = builder.col(field.name(), array::rand_type(field.data_type()));
2759 }
2760 builder
2761}
2762
#[cfg(test)]
mod tests {

    use arrow::datatypes::{Float32Type, Int16Type, Int8Type, UInt32Type};
    use arrow_array::{BooleanArray, Float32Array, Int16Array, Int32Array, Int8Array, UInt32Array};

    use super::*;

    /// Step generators count upwards and continue where the previous batch stopped.
    #[test]
    fn test_step() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::step::<Int32Type>();
        assert_eq!(
            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
            Int32Array::from_iter([0, 1, 2, 3, 4])
        );
        assert_eq!(
            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
            Int32Array::from_iter([5, 6, 7, 8, 9])
        );

        let mut genn = array::step::<Int8Type>();
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int8Array::from_iter([0, 1, 2])
        );

        let mut genn = array::step::<Float32Type>();
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Float32Array::from_iter([0.0, 1.0, 2.0])
        );

        // Custom start (4) and step (8), again continuing across batches.
        let mut genn = array::step_custom::<Int16Type>(4, 8);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int16Array::from_iter([4, 12, 20])
        );
        assert_eq!(
            *genn.generate(RowCount::from(2), &mut rng).unwrap(),
            Int16Array::from_iter([28, 36])
        );
    }

    /// Cycle generators repeat their values and keep their position across batches.
    #[test]
    fn test_cycle() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::cycle::<Int32Type>(vec![1, 2, 3]);
        assert_eq!(
            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
            Int32Array::from_iter([1, 2, 3, 1, 2])
        );

        let mut genn = array::cycle_utf8_literals(&["abc", "def", "xyz"]);
        assert_eq!(
            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
            StringArray::from_iter_values(["abc", "def", "xyz", "abc", "def"])
        );
        assert_eq!(
            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
            StringArray::from_iter_values(["xyz"])
        );

        let mut genn = array::cycle_bool(vec![false, false, true]);
        assert_eq!(
            *genn.generate(RowCount::from(5), &mut rng).unwrap(),
            BooleanArray::from_iter(vec![false, false, true, false, false].into_iter().map(Some))
        );
        assert_eq!(
            *genn.generate(RowCount::from(1), &mut rng).unwrap(),
            BooleanArray::from_iter(vec![Some(true)])
        )
    }

    /// Fill generators emit the same value for every row of every batch.
    #[test]
    fn test_fill() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::fill::<Int32Type>(42);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int32Array::from_iter([42, 42, 42])
        );
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int32Array::from_iter([42, 42, 42])
        );

        let mut genn = array::fill_varbin(vec![0, 1, 2]);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::BinaryArray::from_iter_values([
                "\x00\x01\x02",
                "\x00\x01\x02",
                "\x00\x01\x02"
            ])
        );

        let mut genn = array::fill_utf8("xyz".to_string());
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::StringArray::from_iter_values(["xyz", "xyz", "xyz"])
        );
    }

    /// The prefix-plus-counter generator appends an increasing counter to the prefix.
    #[test]
    fn test_utf8_prefix_plus_counter() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::utf8_prefix_plus_counter("user_", false);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::StringArray::from_iter_values(["user_0", "user_1", "user_2"])
        );
    }

    /// Random generators are deterministic for a fixed seed.
    ///
    /// The literal expectations below are pinned to `DEFAULT_SEED` and to the
    /// order in which the generators consume the shared RNG.
    #[test]
    fn test_rng() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<Int32Type>();
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            Int32Array::from_iter([-797553329, 1369325940, -69174021])
        );

        let mut genn = array::rand_fixedbin(ByteCount::from(3), false);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::BinaryArray::from_iter_values([
                [184, 53, 216],
                [12, 96, 159],
                [125, 179, 56]
            ])
        );

        let mut genn = array::rand_utf8(ByteCount::from(3), false);
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::StringArray::from_iter_values([">@p", "n `", "NWa"])
        );

        let mut genn = array::random_sentence(1, 5, false);
        let words = genn.generate(RowCount::from(10), &mut rng).unwrap();
        assert_eq!(words.data_type(), &DataType::Utf8);
        let words_array = words.as_any().downcast_ref::<StringArray>().unwrap();
        for i in 0..10 {
            // Every sentence must respect the requested word-count bounds.
            let sentence = words_array.value(i);
            let word_count = sentence.split_whitespace().count();
            assert!((1..=5).contains(&word_count));
        }

        // Date generators depend on the current time, so only the type is checked.
        let mut genn = array::rand_date32();
        let days_32 = genn.generate(RowCount::from(3), &mut rng).unwrap();
        assert_eq!(days_32.data_type(), &DataType::Date32);

        let mut genn = array::rand_date64();
        let days_64 = genn.generate(RowCount::from(3), &mut rng).unwrap();
        assert_eq!(days_64.data_type(), &DataType::Date64);

        let mut genn = array::rand_boolean();
        let bools = genn.generate(RowCount::from(1024), &mut rng).unwrap();
        assert_eq!(bools.data_type(), &DataType::Boolean);
        let bools = bools.as_any().downcast_ref::<BooleanArray>().unwrap();
        // Both values should show up in meaningful numbers.
        assert!(bools.false_count() > 100);
        assert!(bools.true_count() > 100);

        let mut genn = array::rand_varbin(ByteCount::from(2), ByteCount::from(4));
        assert_eq!(
            *genn.generate(RowCount::from(3), &mut rng).unwrap(),
            arrow_array::BinaryArray::from_iter_values([
                vec![234, 107],
                vec![220, 152],
                vec![21, 16, 184, 220]
            ])
        );
    }

    /// Random lists include empty lists and never grow large.
    #[test]
    fn test_rng_list() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand_list(&DataType::Int32, false);
        let arr = genn.generate(RowCount::from(100), &mut rng).unwrap();
        let arr = arr.as_list::<i32>();
        // Empty lists must be possible (the RNG is seeded, so this is not flaky).
        assert!(arr.iter().any(|l| l.unwrap().is_empty()));
        // Every list must stay short. This has to be `all`: with `any`, the
        // empty list found above would satisfy the check on its own.
        // NOTE(review): assumes the list generator caps lengths below 11 — confirm.
        assert!(arr.iter().all(|l| l.unwrap().len() < 11));
    }

    /// Random `u32` values are spread across all 256 top-byte buckets.
    #[test]
    fn test_rng_distribution() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<UInt32Type>();
        for _ in 0..10 {
            let arr = genn.generate(RowCount::from(10000), &mut rng).unwrap();
            let int_arr = arr.as_any().downcast_ref::<UInt32Array>().unwrap();
            // Bucket by the most significant byte; 10000 / 256 is about 39 per bucket.
            let mut buckets = vec![0_u32; 256];
            for val in int_arr.values() {
                buckets[(*val >> 24) as usize] += 1;
            }
            for bucket in buckets {
                assert!(bucket > 15);
            }
        }
    }

    /// Null injection: random ratios, the 0.0 / 1.0 extremes, and fixed patterns.
    #[test]
    fn test_nulls() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.3);

        let arr = genn.generate(RowCount::from(1000), &mut rng).unwrap();

        // Seeded, so the count is exact rather than approximately 300.
        assert_eq!(arr.null_count(), 297);

        for len in 0..100 {
            let arr = genn.generate(RowCount::from(len), &mut rng).unwrap();
            // The reported null count must agree with the validity bitmap.
            assert_eq!(
                arr.null_count(),
                arr.nulls()
                    .map(|nulls| (len as usize)
                        - nulls.buffer().count_set_bits_offset(0, len as usize))
                    .unwrap_or(0)
            );
        }

        let mut genn = array::rand::<Int32Type>().with_random_nulls(0.0);
        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();

        assert_eq!(arr.null_count(), 0);

        let mut genn = array::rand::<Int32Type>().with_random_nulls(1.0);
        let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();

        assert_eq!(arr.null_count(), 10);
        assert!((0..10).all(|idx| arr.is_null(idx)));

        // A fixed pattern repeats: `true` marks the null slot in each group of three.
        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, true]);
        let arr = genn.generate(RowCount::from(7), &mut rng).unwrap();
        assert!((0..2).all(|idx| arr.is_valid(idx)));
        assert!(arr.is_null(2));
        assert!((3..5).all(|idx| arr.is_valid(idx)));
        assert!(arr.is_null(5));
        assert!(arr.is_valid(6));
    }

    /// `cycle_unit_circle(4)` walks the four axis points of the unit circle and wraps.
    #[test]
    fn test_unit_circle() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut genn = array::cycle_unit_circle(4);
        let arr = genn.generate(RowCount::from(6), &mut rng).unwrap();

        let arr_values = arr
            .as_fixed_size_list()
            .values()
            .as_primitive::<Float32Type>()
            .values()
            .to_vec();
        // Six 2-d points flattened into twelve floats.
        assert_eq!(arr_values.len(), 12);
        let expected_values = [1.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 1.0];
        for (actual, expected) in arr_values.iter().zip(expected_values.iter()) {
            assert!((actual - expected).abs() < 0.0001);
        }
    }

    /// Jittered points sit at the jitter distance from their centroid, cycling over centroids.
    #[test]
    fn test_jitter_centroids() {
        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
        let mut centroids_gen = array::cycle_unit_circle(4);
        let centroids = centroids_gen.generate(RowCount::from(4), &mut rng).unwrap();

        let centroid_values = centroids
            .as_fixed_size_list()
            .values()
            .as_primitive::<Float32Type>()
            .values()
            .to_vec();

        let mut jitter_jen = array::jitter_centroids(centroids, 0.001);
        let jittered = jitter_jen.generate(RowCount::from(100), &mut rng).unwrap();

        let values = jittered
            .as_fixed_size_list()
            .values()
            .as_primitive::<Float32Type>()
            .values()
            .to_vec();

        for i in 0..100 {
            // Row `i` is expected to be derived from centroid `i % 4`.
            let centroid = i % 4;
            let centroid_x = centroid_values[centroid * 2];
            let centroid_y = centroid_values[centroid * 2 + 1];
            let value_x = values[i * 2];
            let value_y = values[i * 2 + 1];

            let l2_dist = ((value_x - centroid_x).powi(2) + (value_y - centroid_y).powi(2)).sqrt();
            assert!(l2_dist < 0.001001);
            assert!(l2_dist > 0.000999);
        }
    }

    /// `rand(&schema)` yields batches matching the schema and the requested byte size.
    #[test]
    fn test_rand_schema() {
        let schema = Schema::new(vec![
            Field::new("a", DataType::Int32, true),
            Field::new("b", DataType::Utf8, true),
            Field::new("c", DataType::Float32, true),
            Field::new("d", DataType::Int32, true),
            Field::new("e", DataType::Int32, true),
        ]);
        let rbr = rand(&schema)
            .into_reader_bytes(
                ByteCount::from(1024 * 1024),
                BatchCount::from(8),
                RoundingBehavior::ExactOrErr,
            )
            .unwrap();
        assert_eq!(*rbr.schema(), schema);

        let batches = rbr.map(|val| val.unwrap()).collect::<Vec<_>>();
        assert_eq!(batches.len(), 8);

        for batch in batches {
            // The expectation corresponds to 32 bytes per row for this schema.
            assert_eq!(batch.num_rows(), 1024 * 1024 / 32);
            assert_eq!(batch.num_columns(), 5);
        }
    }
}