1use std::{collections::HashMap, iter, marker::PhantomData, sync::Arc, sync::LazyLock};
5
6use arrow::{
7 array::{ArrayData, AsArray, Float32Builder, GenericBinaryBuilder, GenericStringBuilder},
8 buffer::{BooleanBuffer, Buffer, OffsetBuffer, ScalarBuffer},
9 datatypes::{
10 ArrowPrimitiveType, Float32Type, Int32Type, Int64Type, IntervalDayTime,
11 IntervalMonthDayNano, UInt32Type,
12 },
13};
14use arrow_array::{
15 Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray,
16 LargeStringArray, ListArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch,
17 RecordBatchOptions, RecordBatchReader, StringArray, StructArray, make_array,
18 types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type},
19};
20use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef};
21use futures::{StreamExt, stream::BoxStream};
22use rand::{Rng, RngCore, SeedableRng, distr::Uniform};
23use rand_distr::Zipf;
24
25use self::array::rand_with_distribution;
26
/// A number of rows, stored as a `u64`.
#[derive(Copy, Clone, Debug, Default)]
pub struct RowCount(u64);

impl From<u64> for RowCount {
    /// Wraps a raw `u64` as a [`RowCount`].
    fn from(value: u64) -> Self {
        RowCount(value)
    }
}

/// A number of batches, stored as a `u32`.
#[derive(Copy, Clone, Debug, Default)]
pub struct BatchCount(u32);

impl From<u32> for BatchCount {
    /// Wraps a raw `u32` as a [`BatchCount`].
    fn from(value: u32) -> Self {
        BatchCount(value)
    }
}

/// A number of bytes, stored as a `u64`.
#[derive(Copy, Clone, Debug, Default)]
pub struct ByteCount(u64);

impl From<u64> for ByteCount {
    /// Wraps a raw `u64` as a [`ByteCount`].
    fn from(value: u64) -> Self {
        ByteCount(value)
    }
}

/// A dimension (for example the width of a fixed-size list), stored as a `u32`.
#[derive(Copy, Clone, Debug, Default)]
pub struct Dimension(u32);

impl From<u32> for Dimension {
    /// Wraps a raw `u32` as a [`Dimension`].
    fn from(value: u32) -> Self {
        Dimension(value)
    }
}
59
60pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
62 fn generate(
76 &mut self,
77 length: RowCount,
78 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
79 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError>;
80
81 fn generate_default(
91 &mut self,
92 length: RowCount,
93 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
94 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
95 Self::generate(self, length, &mut rng)
96 }
97 fn data_type(&self) -> &DataType;
103 fn metadata(&self) -> Option<HashMap<String, String>> {
105 None
106 }
107 fn element_size_bytes(&self) -> Option<ByteCount>;
113}
114
/// Wraps another generator and replaces the validity of its output with a
/// repeating pattern.
#[derive(Debug)]
pub struct CycleNullGenerator {
    // Generator whose arrays receive the new validity bitmap.
    generator: Box<dyn ArrayGenerator>,
    // Repeating validity pattern; the values are used directly as validity
    // bits, so `true` marks a valid (non-null) slot.
    validity: Vec<bool>,
    // Position inside `validity` at which the next batch starts.
    idx: usize,
}
/// Wraps another generator and overwrites selected values of its floating
/// point output with NaN, following a repeating pattern.
#[derive(Debug)]
pub struct CycleNanGenerator {
    // Generator whose arrays are post-processed.
    generator: Box<dyn ArrayGenerator>,
    // Repeating pattern; `true` marks a slot that is overwritten with NaN.
    nan_pattern: Vec<bool>,
    // Position inside `nan_pattern` at which the next batch starts.
    idx: usize,
}
127
128impl ArrayGenerator for CycleNanGenerator {
129 fn generate(
130 &mut self,
131 length: RowCount,
132 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
133 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
134 let array = self.generator.generate(length, rng)?;
135
136 match array.data_type() {
138 DataType::Float16 => {
139 let float_array = array
140 .as_any()
141 .downcast_ref::<arrow_array::Float16Array>()
142 .unwrap();
143 let mut values: Vec<half::f16> = float_array.values().to_vec();
144
145 for (i, &should_be_nan) in self
146 .nan_pattern
147 .iter()
148 .cycle()
149 .skip(self.idx)
150 .take(length.0 as usize)
151 .enumerate()
152 {
153 if should_be_nan {
154 values[i] = half::f16::NAN;
155 }
156 }
157
158 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
159 Ok(Arc::new(arrow_array::Float16Array::from(values)))
160 }
161 DataType::Float32 => {
162 let float_array = array
163 .as_any()
164 .downcast_ref::<arrow_array::Float32Array>()
165 .unwrap();
166 let mut values: Vec<f32> = float_array.values().to_vec();
167
168 for (i, &should_be_nan) in self
169 .nan_pattern
170 .iter()
171 .cycle()
172 .skip(self.idx)
173 .take(length.0 as usize)
174 .enumerate()
175 {
176 if should_be_nan {
177 values[i] = f32::NAN;
178 }
179 }
180
181 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
182 Ok(Arc::new(arrow_array::Float32Array::from(values)))
183 }
184 DataType::Float64 => {
185 let float_array = array
186 .as_any()
187 .downcast_ref::<arrow_array::Float64Array>()
188 .unwrap();
189 let mut values: Vec<f64> = float_array.values().to_vec();
190
191 for (i, &should_be_nan) in self
192 .nan_pattern
193 .iter()
194 .cycle()
195 .skip(self.idx)
196 .take(length.0 as usize)
197 .enumerate()
198 {
199 if should_be_nan {
200 values[i] = f64::NAN;
201 }
202 }
203
204 self.idx = (self.idx + (length.0 as usize)) % self.nan_pattern.len();
205 Ok(Arc::new(arrow_array::Float64Array::from(values)))
206 }
207 _ => {
208 Ok(array)
210 }
211 }
212 }
213
214 fn data_type(&self) -> &DataType {
215 self.generator.data_type()
216 }
217
218 fn element_size_bytes(&self) -> Option<ByteCount> {
219 self.generator.element_size_bytes()
220 }
221}
222
223impl ArrayGenerator for CycleNullGenerator {
224 fn generate(
225 &mut self,
226 length: RowCount,
227 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
228 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
229 let array = self.generator.generate(length, rng)?;
230 let data = array.to_data();
231 let validity_itr = self
232 .validity
233 .iter()
234 .cycle()
235 .skip(self.idx)
236 .take(length.0 as usize)
237 .copied();
238 let validity_bitmap = BooleanBuffer::from_iter(validity_itr);
239
240 self.idx = (self.idx + (length.0 as usize)) % self.validity.len();
241 unsafe {
242 let new_data = ArrayData::new_unchecked(
243 data.data_type().clone(),
244 data.len(),
245 None,
246 Some(validity_bitmap.into_inner()),
247 data.offset(),
248 data.buffers().to_vec(),
249 data.child_data().into(),
250 );
251 Ok(make_array(new_data))
252 }
253 }
254
255 fn data_type(&self) -> &DataType {
256 self.generator.data_type()
257 }
258
259 fn element_size_bytes(&self) -> Option<ByteCount> {
260 self.generator.element_size_bytes()
261 }
262}
263
/// Wraps another generator and reports a fixed set of metadata for it.
#[derive(Debug)]
pub struct MetadataGenerator {
    // Generator that produces the actual arrays.
    generator: Box<dyn ArrayGenerator>,
    // Key/value pairs returned (as a clone) from `metadata()`.
    metadata: HashMap<String, String>,
}
269
270impl ArrayGenerator for MetadataGenerator {
271 fn generate(
272 &mut self,
273 length: RowCount,
274 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
275 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
276 self.generator.generate(length, rng)
277 }
278
279 fn metadata(&self) -> Option<HashMap<String, String>> {
280 Some(self.metadata.clone())
281 }
282
283 fn data_type(&self) -> &DataType {
284 self.generator.data_type()
285 }
286
287 fn element_size_bytes(&self) -> Option<ByteCount> {
288 self.generator.element_size_bytes()
289 }
290}
291
/// Wraps another generator and randomly marks values of its output as null.
#[derive(Debug)]
pub struct NullGenerator {
    // Generator whose arrays receive the random validity bitmap.
    generator: Box<dyn ArrayGenerator>,
    // Probability that a value is null; `generate` rejects values outside
    // the range 0 to 1.
    null_probability: f64,
}
297
298impl ArrayGenerator for NullGenerator {
299 fn generate(
300 &mut self,
301 length: RowCount,
302 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
303 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
304 let array = self.generator.generate(length, rng)?;
305 let data = array.to_data();
306
307 if self.null_probability < 0.0 || self.null_probability > 1.0 {
308 return Err(ArrowError::InvalidArgumentError(format!(
309 "null_probability must be between 0 and 1, got {}",
310 self.null_probability
311 )));
312 }
313
314 let (null_count, new_validity) = if self.null_probability == 0.0 {
315 if data.null_count() == 0 {
316 return Ok(array);
317 } else {
318 (0_usize, None)
319 }
320 } else if self.null_probability == 1.0 {
321 if data.null_count() == data.len() {
322 return Ok(array);
323 } else {
324 let all_nulls = BooleanBuffer::new_unset(array.len());
325 (array.len(), Some(all_nulls.into_inner()))
326 }
327 } else {
328 let array_len = array.len();
329 let num_validity_bytes = array_len.div_ceil(8);
330 let mut null_count = 0;
331 let threshold = (self.null_probability * u8::MAX as f64) as u8;
334 let bytes = (0..num_validity_bytes)
335 .map(|byte_idx| {
336 let mut sample = rng.random::<u64>();
337 let mut byte: u8 = 0;
338 for bit_idx in 0..8 {
339 byte <<= 1;
342 let pos = byte_idx * 8 + (7 - bit_idx);
343 if pos < array_len {
344 let sample_piece = sample & 0xFF;
345 let is_null = (sample_piece as u8) < threshold;
346 byte |= (!is_null) as u8;
347 null_count += is_null as usize;
348 }
349 sample >>= 8;
350 }
351 byte
352 })
353 .collect::<Vec<_>>();
354 let new_validity = Buffer::from_iter(bytes);
355 (null_count, Some(new_validity))
356 };
357
358 unsafe {
359 let new_data = ArrayData::new_unchecked(
360 data.data_type().clone(),
361 data.len(),
362 Some(null_count),
363 new_validity,
364 data.offset(),
365 data.buffers().to_vec(),
366 data.child_data().into(),
367 );
368 Ok(make_array(new_data))
369 }
370 }
371
372 fn metadata(&self) -> Option<HashMap<String, String>> {
373 self.generator.metadata()
374 }
375
376 fn data_type(&self) -> &DataType {
377 self.generator.data_type()
378 }
379
380 fn element_size_bytes(&self) -> Option<ByteCount> {
381 self.generator.element_size_bytes()
382 }
383}
384
/// Combinators that wrap a generator in one of the decorating generators
/// defined in this file.
pub trait ArrayGeneratorExt {
    /// Wraps the generator so that values become null with the given
    /// probability.
    fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator>;
    /// Wraps the generator with a repeating null pattern; `true` marks a null
    /// slot.
    fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
    /// Wraps the generator with a repeating NaN pattern; `true` marks a slot
    /// that is overwritten with NaN.
    fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator>;
    /// Wraps the generator with a repeating validity pattern.
    ///
    /// NOTE(review): despite the parameter being named `nulls` here, the
    /// implementation in this file uses the slice as validity bits (`true`
    /// marks a valid slot), the inverse of `with_nulls`.
    fn with_validity(self, nulls: &[bool]) -> Box<dyn ArrayGenerator>;
    /// Wraps the generator so that it reports the given metadata.
    fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator>;
}
398
399impl ArrayGeneratorExt for Box<dyn ArrayGenerator> {
400 fn with_random_nulls(self, null_probability: f64) -> Box<dyn ArrayGenerator> {
401 Box::new(NullGenerator {
402 generator: self,
403 null_probability,
404 })
405 }
406
407 fn with_nulls(self, nulls: &[bool]) -> Box<dyn ArrayGenerator> {
408 Box::new(CycleNullGenerator {
409 generator: self,
410 validity: nulls.iter().map(|v| !*v).collect(),
411 idx: 0,
412 })
413 }
414
415 fn with_nans(self, nans: &[bool]) -> Box<dyn ArrayGenerator> {
416 Box::new(CycleNanGenerator {
417 generator: self,
418 nan_pattern: nans.to_vec(),
419 idx: 0,
420 })
421 }
422
423 fn with_validity(self, validity: &[bool]) -> Box<dyn ArrayGenerator> {
424 Box::new(CycleNullGenerator {
425 generator: self,
426 validity: validity.to_vec(),
427 idx: 0,
428 })
429 }
430
431 fn with_metadata(self, metadata: HashMap<String, String>) -> Box<dyn ArrayGenerator> {
432 Box::new(MetadataGenerator {
433 generator: self,
434 metadata,
435 })
436 }
437}
438
/// Iterator adaptor that yields every item of the inner iterator `n` times in
/// a row.
///
/// The adaptor can be resumed mid-run: `cur` is first yielded `count` more
/// times before a new item is pulled from `iter`.
pub struct NTimesIter<I: Iterator>
where
    I::Item: Copy,
{
    /// Source of fresh items.
    iter: I,
    /// How many times each fresh item is yielded. `0` is treated like `1`.
    n: u32,
    /// The item currently being repeated.
    cur: I::Item,
    /// How many more times `cur` must be yielded before pulling a new item.
    count: u32,
}

impl<I: Iterator> Iterator for NTimesIter<I>
where
    I::Item: Copy,
{
    type Item = I::Item;

    /// Yields the current item while repeats are pending, otherwise pulls the
    /// next item from the inner iterator.
    fn next(&mut self) -> Option<Self::Item> {
        if self.count == 0 {
            // Pull first and only then arm the repeat counter: if the inner
            // iterator is exhausted the counter must stay at zero, otherwise
            // later calls would keep yielding the stale `cur`.
            self.cur = self.iter.next()?;
            // `saturating_sub` keeps `n == 0` from underflowing.
            self.count = self.n.saturating_sub(1);
        } else {
            self.count -= 1;
        }
        Some(self.cur)
    }

    /// Bounds the remaining length as pending repeats of the current item
    /// plus `n` outputs per remaining inner item.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let per_item = self.n.max(1) as usize;
        let pending = self.count as usize;
        let (lower, upper) = self.iter.size_hint();
        let lower = lower.saturating_mul(per_item).saturating_add(pending);
        // An overflowing upper bound is reported as unknown.
        let upper = upper
            .and_then(|u| u.checked_mul(per_item))
            .and_then(|u| u.checked_add(pending));
        (lower, upper)
    }
}
477
/// Generator that builds arrays of `ArrayType` from the values returned by a
/// closure, optionally repeating each value several times.
pub struct FnGen<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T>
where
    T: Copy + Default,
    ArrayType: arrow_array::Array + From<Vec<T>>,
{
    // Data type reported by `data_type()`.
    data_type: DataType,
    // Closure called with the RNG to produce each fresh value.
    generator: F,
    // Marker tying the generator to the array type it builds.
    array_type: PhantomData<ArrayType>,
    // Number of times each fresh value is repeated; repetition is only applied
    // when this is greater than 1.
    repeat: u32,
    // Value carried over from the previous batch for repetition.
    leftover: T,
    // Repeat counter carried over from the previous batch.
    leftover_count: u32,
    // Element size reported by `element_size_bytes()`, if known.
    element_size_bytes: Option<ByteCount>,
}
491
492impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> std::fmt::Debug
493 for FnGen<T, ArrayType, F>
494where
495 T: Copy + Default,
496 ArrayType: arrow_array::Array + From<Vec<T>>,
497{
498 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
499 f.debug_struct("FnGen")
500 .field("data_type", &self.data_type)
501 .field("array_type", &self.array_type)
502 .field("repeat", &self.repeat)
503 .field("leftover_count", &self.leftover_count)
504 .field("element_size_bytes", &self.element_size_bytes)
505 .finish()
506 }
507}
508
509impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
510where
511 T: Copy + Default,
512 ArrayType: arrow_array::Array + From<Vec<T>>,
513{
514 fn new_known_size(
515 data_type: DataType,
516 generator: F,
517 repeat: u32,
518 element_size_bytes: ByteCount,
519 ) -> Self {
520 Self {
521 data_type,
522 generator,
523 array_type: PhantomData,
524 repeat,
525 leftover: T::default(),
526 leftover_count: 0,
527 element_size_bytes: Some(element_size_bytes),
528 }
529 }
530
531 fn new_unknown_size(data_type: DataType, generator: F, repeat: u32) -> Self {
532 Self {
533 data_type,
534 generator,
535 array_type: PhantomData,
536 repeat,
537 leftover: T::default(),
538 leftover_count: 0,
539 element_size_bytes: None,
540 }
541 }
542}
543
544impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> ArrayGenerator
545 for FnGen<T, ArrayType, F>
546where
547 T: Copy + Default + Send + Sync,
548 ArrayType: arrow_array::Array + From<Vec<T>> + 'static,
549 F: Send + Sync,
550{
551 fn generate(
552 &mut self,
553 length: RowCount,
554 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
555 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
556 let iter = (0..length.0).map(|_| (self.generator)(rng));
557 let values = if self.repeat > 1 {
558 Vec::from_iter(
559 NTimesIter {
560 iter,
561 n: self.repeat,
562 cur: self.leftover,
563 count: self.leftover_count,
564 }
565 .take(length.0 as usize),
566 )
567 } else {
568 Vec::from_iter(iter)
569 };
570 self.leftover_count = ((self.leftover_count as u64 + length.0) % self.repeat as u64) as u32;
571 self.leftover = values.last().copied().unwrap_or(T::default());
572 Ok(Arc::new(ArrayType::from(values)))
573 }
574
575 fn data_type(&self) -> &DataType {
576 &self.data_type
577 }
578
579 fn element_size_bytes(&self) -> Option<ByteCount> {
580 self.element_size_bytes
581 }
582}
583
/// Seed value for the random number generator.
#[derive(Copy, Clone, Debug)]
pub struct Seed(pub u64);

/// Seed used when the caller does not supply one.
pub const DEFAULT_SEED: Seed = Seed(42);

impl From<u64> for Seed {
    /// Wraps a raw `u64` as a [`Seed`].
    fn from(value: u64) -> Self {
        Seed(value)
    }
}
593
594#[derive(Debug)]
595pub struct CycleVectorGenerator {
596 underlying_gen: Box<dyn ArrayGenerator>,
597 dimension: Dimension,
598 data_type: DataType,
599}
600
601impl CycleVectorGenerator {
602 pub fn new(underlying_gen: Box<dyn ArrayGenerator>, dimension: Dimension) -> Self {
603 let data_type = DataType::FixedSizeList(
604 Arc::new(Field::new("item", underlying_gen.data_type().clone(), true)),
605 dimension.0 as i32,
606 );
607 Self {
608 underlying_gen,
609 dimension,
610 data_type,
611 }
612 }
613}
614
615impl ArrayGenerator for CycleVectorGenerator {
616 fn generate(
617 &mut self,
618 length: RowCount,
619 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
620 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
621 let values = self
622 .underlying_gen
623 .generate(RowCount::from(length.0 * self.dimension.0 as u64), rng)?;
624 let field = Arc::new(Field::new("item", values.data_type().clone(), true));
625 let values = Arc::new(values);
626
627 let array = FixedSizeListArray::try_new(field, self.dimension.0 as i32, values, None)?;
628
629 Ok(Arc::new(array))
630 }
631
632 fn data_type(&self) -> &DataType {
633 &self.data_type
634 }
635
636 fn element_size_bytes(&self) -> Option<ByteCount> {
637 self.underlying_gen
638 .element_size_bytes()
639 .map(|byte_count| ByteCount::from(byte_count.0 * self.dimension.0 as u64))
640 }
641}
642
643#[derive(Debug)]
644pub struct CycleListGenerator {
645 underlying_gen: Box<dyn ArrayGenerator>,
646 lengths_gen: Box<dyn ArrayGenerator>,
647 data_type: DataType,
648}
649
650impl CycleListGenerator {
651 pub fn new(
652 underlying_gen: Box<dyn ArrayGenerator>,
653 min_list_size: Dimension,
654 max_list_size: Dimension,
655 ) -> Self {
656 let data_type = DataType::List(Arc::new(Field::new(
657 "item",
658 underlying_gen.data_type().clone(),
659 true,
660 )));
661 let lengths_dist = Uniform::new(min_list_size.0, max_list_size.0).unwrap();
662 let lengths_gen = rand_with_distribution::<UInt32Type, Uniform<u32>>(lengths_dist);
663 Self {
664 underlying_gen,
665 lengths_gen,
666 data_type,
667 }
668 }
669}
670
671impl ArrayGenerator for CycleListGenerator {
672 fn generate(
673 &mut self,
674 length: RowCount,
675 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
676 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
677 let lengths = self.lengths_gen.generate(length, rng)?;
678 let lengths = lengths.as_primitive::<UInt32Type>();
679 let total_length = lengths.values().iter().map(|i| *i as u64).sum::<u64>();
680 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
681 let values = self
682 .underlying_gen
683 .generate(RowCount::from(total_length), rng)?;
684 let field = Arc::new(Field::new("item", values.data_type().clone(), true));
685 let values = Arc::new(values);
686
687 let array = ListArray::try_new(field, offsets, values, None)?;
688
689 Ok(Arc::new(array))
690 }
691
692 fn data_type(&self) -> &DataType {
693 &self.data_type
694 }
695
696 fn element_size_bytes(&self) -> Option<ByteCount> {
697 None
698 }
699}
700
701#[derive(Debug, Default)]
702pub struct PseudoUuidGenerator {}
703
704impl ArrayGenerator for PseudoUuidGenerator {
705 fn generate(
706 &mut self,
707 length: RowCount,
708 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
709 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
710 Ok(Arc::new(FixedSizeBinaryArray::try_from_iter(
711 (0..length.0).map(|_| {
712 let mut data = vec![0; 16];
713 rng.fill_bytes(&mut data);
714 data
715 }),
716 )?))
717 }
718
719 fn data_type(&self) -> &DataType {
720 &DataType::FixedSizeBinary(16)
721 }
722
723 fn element_size_bytes(&self) -> Option<ByteCount> {
724 Some(ByteCount::from(16))
725 }
726}
727
728#[derive(Debug, Default)]
729pub struct PseudoUuidHexGenerator {}
730
731impl ArrayGenerator for PseudoUuidHexGenerator {
732 fn generate(
733 &mut self,
734 length: RowCount,
735 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
736 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
737 let mut data = vec![0; 16 * length.0 as usize];
738 rng.fill_bytes(&mut data);
739 let data_hex = hex::encode(data);
740
741 Ok(Arc::new(StringArray::from_iter_values(
742 (0..length.0 as usize).map(|i| data_hex.get(i * 32..(i + 1) * 32).unwrap()),
743 )))
744 }
745
746 fn data_type(&self) -> &DataType {
747 &DataType::Utf8
748 }
749
750 fn element_size_bytes(&self) -> Option<ByteCount> {
751 Some(ByteCount::from(16))
752 }
753}
754
755#[derive(Debug, Default)]
756pub struct RandomBooleanGenerator {}
757
758impl ArrayGenerator for RandomBooleanGenerator {
759 fn generate(
760 &mut self,
761 length: RowCount,
762 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
763 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
764 let num_bytes = length.0.div_ceil(8);
765 let mut bytes = vec![0; num_bytes as usize];
766 rng.fill_bytes(&mut bytes);
767 let bytes = BooleanBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
768 Ok(Arc::new(arrow_array::BooleanArray::new(bytes, None)))
769 }
770
771 fn data_type(&self) -> &DataType {
772 &DataType::Boolean
773 }
774
775 fn element_size_bytes(&self) -> Option<ByteCount> {
776 None
779 }
780}
781
782pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
785 phantom: PhantomData<T>,
786 data_type: DataType,
787}
788
789impl<T: ArrowPrimitiveType + Send + Sync> std::fmt::Debug for RandomBytesGenerator<T> {
790 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
791 f.debug_struct("RandomBytesGenerator")
792 .field("data_type", &self.data_type)
793 .finish()
794 }
795}
796
797impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
798 fn new(data_type: DataType) -> Self {
799 Self {
800 phantom: Default::default(),
801 data_type,
802 }
803 }
804
805 fn byte_width() -> Result<u64, ArrowError> {
806 T::DATA_TYPE.primitive_width().ok_or_else(|| ArrowError::InvalidArgumentError(format!("Cannot generate the data type {} with the RandomBytesGenerator because it is not a fixed-width bytes type", T::DATA_TYPE))).map(|val| val as u64)
807 }
808}
809
810impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerator<T> {
811 fn generate(
812 &mut self,
813 length: RowCount,
814 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
815 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
816 let num_bytes = length.0 * Self::byte_width()?;
817 let mut bytes = vec![0; num_bytes as usize];
818 rng.fill_bytes(&mut bytes);
819 let bytes = ScalarBuffer::new(Buffer::from(bytes), 0, length.0 as usize);
820 Ok(Arc::new(
821 PrimitiveArray::<T>::new(bytes, None).with_data_type(self.data_type.clone()),
822 ))
823 }
824
825 fn data_type(&self) -> &DataType {
826 &self.data_type
827 }
828
829 fn element_size_bytes(&self) -> Option<ByteCount> {
830 Self::byte_width().map(ByteCount::from).ok()
831 }
832}
833
834#[derive(Debug)]
837pub struct RandomFixedSizeBinaryGenerator {
838 data_type: DataType,
839 size: i32,
840}
841
842impl RandomFixedSizeBinaryGenerator {
843 fn new(size: i32) -> Self {
844 Self {
845 size,
846 data_type: DataType::FixedSizeBinary(size),
847 }
848 }
849}
850
851impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
852 fn generate(
853 &mut self,
854 length: RowCount,
855 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
856 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
857 let num_bytes = length.0 * self.size as u64;
858 let mut bytes = vec![0; num_bytes as usize];
859 rng.fill_bytes(&mut bytes);
860 Ok(Arc::new(FixedSizeBinaryArray::new(
861 self.size,
862 Buffer::from(bytes),
863 None,
864 )))
865 }
866
867 fn data_type(&self) -> &DataType {
868 &self.data_type
869 }
870
871 fn element_size_bytes(&self) -> Option<ByteCount> {
872 Some(ByteCount::from(self.size as u64))
873 }
874}
875
876#[derive(Debug)]
877pub struct RandomIntervalGenerator {
878 unit: IntervalUnit,
879 data_type: DataType,
880}
881
882impl RandomIntervalGenerator {
883 pub fn new(unit: IntervalUnit) -> Self {
884 Self {
885 unit,
886 data_type: DataType::Interval(unit),
887 }
888 }
889}
890
891impl ArrayGenerator for RandomIntervalGenerator {
892 fn generate(
893 &mut self,
894 length: RowCount,
895 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
896 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
897 match self.unit {
898 IntervalUnit::YearMonth => {
899 let months = (0..length.0)
900 .map(|_| rng.random::<i32>())
901 .collect::<Vec<_>>();
902 Ok(Arc::new(arrow_array::IntervalYearMonthArray::from(months)))
903 }
904 IntervalUnit::MonthDayNano => {
905 let day_time_array = (0..length.0)
906 .map(|_| IntervalMonthDayNano::new(rng.random(), rng.random(), rng.random()))
907 .collect::<Vec<_>>();
908 Ok(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(
909 day_time_array,
910 )))
911 }
912 IntervalUnit::DayTime => {
913 let day_time_array = (0..length.0)
914 .map(|_| IntervalDayTime::new(rng.random(), rng.random()))
915 .collect::<Vec<_>>();
916 Ok(Arc::new(arrow_array::IntervalDayTimeArray::from(
917 day_time_array,
918 )))
919 }
920 }
921 }
922
923 fn data_type(&self) -> &DataType {
924 &self.data_type
925 }
926
927 fn element_size_bytes(&self) -> Option<ByteCount> {
928 Some(ByteCount::from(12))
929 }
930}
931#[derive(Debug)]
932pub struct RandomBinaryGenerator {
933 bytes_per_element: ByteCount,
934 scale_to_utf8: bool,
935 is_large: bool,
936 data_type: DataType,
937}
938
939impl RandomBinaryGenerator {
940 pub fn new(bytes_per_element: ByteCount, scale_to_utf8: bool, is_large: bool) -> Self {
941 Self {
942 bytes_per_element,
943 scale_to_utf8,
944 is_large,
945 data_type: match (scale_to_utf8, is_large) {
946 (false, false) => DataType::Binary,
947 (false, true) => DataType::LargeBinary,
948 (true, false) => DataType::Utf8,
949 (true, true) => DataType::LargeUtf8,
950 },
951 }
952 }
953}
954
955impl ArrayGenerator for RandomBinaryGenerator {
956 fn generate(
957 &mut self,
958 length: RowCount,
959 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
960 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
961 let mut bytes = vec![0; (self.bytes_per_element.0 * length.0) as usize];
962 rng.fill_bytes(&mut bytes);
963 if self.scale_to_utf8 {
964 bytes = bytes.into_iter().map(|val| (val % 95) + 32).collect();
967 }
968 let bytes = Buffer::from(bytes);
969 if self.is_large {
970 let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
971 self.bytes_per_element.0 as usize,
972 length.0 as usize,
973 ));
974 if self.scale_to_utf8 {
975 unsafe {
977 Ok(Arc::new(arrow_array::LargeStringArray::new_unchecked(
978 offsets, bytes, None,
979 )))
980 }
981 } else {
982 unsafe {
983 Ok(Arc::new(arrow_array::LargeBinaryArray::new_unchecked(
984 offsets, bytes, None,
985 )))
986 }
987 }
988 } else {
989 let offsets = OffsetBuffer::from_lengths(iter::repeat_n(
990 self.bytes_per_element.0 as usize,
991 length.0 as usize,
992 ));
993 if self.scale_to_utf8 {
994 unsafe {
996 Ok(Arc::new(arrow_array::StringArray::new_unchecked(
997 offsets, bytes, None,
998 )))
999 }
1000 } else {
1001 unsafe {
1002 Ok(Arc::new(arrow_array::BinaryArray::new_unchecked(
1003 offsets, bytes, None,
1004 )))
1005 }
1006 }
1007 }
1008 }
1009
1010 fn data_type(&self) -> &DataType {
1011 &self.data_type
1012 }
1013
1014 fn element_size_bytes(&self) -> Option<ByteCount> {
1015 Some(ByteCount::from(
1017 self.bytes_per_element.0 + std::mem::size_of::<i32>() as u64,
1018 ))
1019 }
1020}
1021
1022#[derive(Debug)]
1026pub struct PrefixPlusCounterGenerator {
1027 prefix: String,
1028 is_large: bool,
1029 data_type: DataType,
1030 current_counter: u64,
1031}
1032
1033impl PrefixPlusCounterGenerator {
1034 pub fn new(prefix: String, is_large: bool) -> Self {
1035 Self {
1036 prefix,
1037 is_large,
1038 data_type: if is_large {
1039 DataType::LargeUtf8
1040 } else {
1041 DataType::Utf8
1042 },
1043 current_counter: 0,
1044 }
1045 }
1046
1047 fn generate_values<T: OffsetSizeTrait>(
1048 &self,
1049 start: u64,
1050 num_values: u64,
1051 ) -> Result<Arc<dyn Array>, ArrowError> {
1052 let max_counter = start + num_values;
1053 let max_digits_per_counter = (max_counter as f64).log10().ceil() as u64;
1054 let max_bytes_per_str = max_digits_per_counter + self.prefix.len() as u64;
1055 let max_bytes = max_bytes_per_str * num_values;
1056 let mut builder =
1057 GenericStringBuilder::<T>::with_capacity(num_values as usize, max_bytes as usize);
1058 let mut word = String::with_capacity(max_bytes_per_str as usize);
1059 word.push_str(&self.prefix);
1060 for i in 0..num_values {
1061 let counter = start + i;
1062 word.truncate(self.prefix.len());
1063 word.push_str(&counter.to_string());
1064 builder.append_value(&word);
1065 }
1066 Ok(Arc::new(builder.finish()))
1067 }
1068}
1069
1070impl ArrayGenerator for PrefixPlusCounterGenerator {
1071 fn generate(
1072 &mut self,
1073 length: RowCount,
1074 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1075 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1076 let start = self.current_counter;
1077 self.current_counter += length.0;
1078 if self.is_large {
1079 self.generate_values::<i64>(start, length.0)
1080 } else {
1081 self.generate_values::<i32>(start, length.0)
1082 }
1083 }
1084
1085 fn data_type(&self) -> &DataType {
1086 &self.data_type
1087 }
1088
1089 fn element_size_bytes(&self) -> Option<ByteCount> {
1090 None
1092 }
1093}
1094
1095#[derive(Debug)]
1101pub struct BinaryPrefixPlusCounterGenerator {
1102 prefix: Arc<[u8]>,
1103 is_large: bool,
1104 data_type: DataType,
1105 current_counter: u64,
1106}
1107
1108impl BinaryPrefixPlusCounterGenerator {
1109 pub fn new(prefix: Arc<[u8]>, is_large: bool) -> Self {
1110 Self {
1111 prefix,
1112 is_large,
1113 data_type: if is_large {
1114 DataType::LargeBinary
1115 } else {
1116 DataType::Binary
1117 },
1118 current_counter: 0,
1119 }
1120 }
1121
1122 fn generate_values<T: OffsetSizeTrait>(
1123 &self,
1124 start: u64,
1125 num_values: u64,
1126 ) -> Result<Arc<dyn Array>, ArrowError> {
1127 let max_bytes = (self.prefix.len() + std::mem::size_of::<u64>()) * num_values as usize;
1128 let mut builder = GenericBinaryBuilder::<T>::with_capacity(num_values as usize, max_bytes);
1129 let mut word = Vec::with_capacity(self.prefix.len() + std::mem::size_of::<u64>());
1130 word.extend_from_slice(&self.prefix);
1131 for i in 0..num_values {
1132 let counter = start + i;
1133 word.truncate(self.prefix.len());
1134 if counter < u8::MAX as u64 {
1135 word.push(counter as u8);
1136 } else if counter < u16::MAX as u64 {
1137 word.extend_from_slice(&(counter as u16).to_le_bytes());
1138 } else if counter < u32::MAX as u64 {
1139 word.extend_from_slice(&(counter as u32).to_le_bytes());
1140 } else {
1141 word.extend_from_slice(&counter.to_le_bytes());
1142 }
1143 builder.append_value(&word);
1144 }
1145 Ok(Arc::new(builder.finish()))
1146 }
1147}
1148
1149impl ArrayGenerator for BinaryPrefixPlusCounterGenerator {
1150 fn generate(
1151 &mut self,
1152 length: RowCount,
1153 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1154 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1155 let start = self.current_counter;
1156 self.current_counter += length.0;
1157 if self.is_large {
1158 self.generate_values::<i64>(start, length.0)
1159 } else {
1160 self.generate_values::<i32>(start, length.0)
1161 }
1162 }
1163
1164 fn data_type(&self) -> &DataType {
1165 &self.data_type
1166 }
1167
1168 fn element_size_bytes(&self) -> Option<ByteCount> {
1169 None
1171 }
1172}
1173
/// Short, very common English words; they form the first part of the
/// combined sentence vocabulary.
const STOP_WORDS: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
    "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
    "the", "their", "then", "there", "these", "they", "this", "to", "was",
    "will", "with",
];
1180
/// Alphabetically ordered vocabulary of general and technical English words;
/// it forms the second part of the combined sentence vocabulary.
#[rustfmt::skip]
const ENGLISH_WORDS: &[&str] = &[
    "ability", "able", "about", "above", "accept", "access", "account", "across", "action", "active",
    "activity", "actual", "address", "adjust", "admin", "advance", "agent", "align", "allow", "amount",
    "analysis", "answer", "application", "archive", "array", "asset", "async", "attribute", "available", "balance",
    "batch", "binary", "bitmap", "block", "branch", "buffer", "build", "cache", "capacity", "catalog",
    "change", "chunk", "client", "cluster", "column", "commit", "common", "compare", "compile", "compute",
    "condition", "config", "connect", "content", "context", "control", "convert", "copy", "core", "count",
    "create", "current", "cursor", "data", "dataset", "decode", "default", "delete", "delta", "depend",
    "derive", "design", "detail", "detect", "device", "direct", "display", "document", "domain", "drive",
    "dynamic", "encode", "engine", "error", "event", "example", "execute", "expand", "expect", "export",
    "extend", "feature", "field", "filter", "final", "finish", "format", "fragment", "future", "generate",
    "global", "group", "handle", "header", "index", "input", "insert", "inspect", "instance", "integer",
    "internal", "item", "join", "kernel", "large", "layer", "layout", "length", "level", "limit",
    "linear", "local", "logical", "lookup", "manage", "manifest", "memory", "merge", "metric", "model",
    "module", "namespace", "native", "node", "normal", "number", "object", "offset", "option", "output",
    "package", "page", "parallel", "parse", "partition", "pattern", "physical", "plan", "policy", "prefix",
    "prepare", "primary", "process", "profile", "project", "property", "query", "range", "reader", "record",
    "region", "registry", "request", "resolve", "resource", "result", "return", "row", "runtime", "scalar",
    "scan", "schema", "search", "segment", "select", "session", "setting", "source", "stable", "stage",
    "state", "static", "storage", "stream", "string", "struct", "table", "target", "task", "thread",
    "token", "trace", "transform", "type", "update", "upload", "value", "vector", "version", "view",
    "write", "writer",
];
1385
1386static SENTENCE_WORDS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
1388 let mut words = Vec::with_capacity(STOP_WORDS.len() + ENGLISH_WORDS.len());
1389 words.extend(STOP_WORDS.iter().copied());
1390 words.extend(ENGLISH_WORDS.iter().copied());
1391 words
1392});
1393
1394struct RandomSentenceGenerator {
1395 min_words: usize,
1396 max_words: usize,
1397 zipf: Zipf<f64>,
1399 is_large: bool,
1400}
1401
1402impl std::fmt::Debug for RandomSentenceGenerator {
1403 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1404 f.debug_struct("RandomSentenceGenerator")
1405 .field("min_words", &self.min_words)
1406 .field("max_words", &self.max_words)
1407 .field("num_words", &SENTENCE_WORDS.len())
1408 .field("is_large", &self.is_large)
1409 .finish()
1410 }
1411}
1412
1413impl RandomSentenceGenerator {
1414 pub fn new(min_words: usize, max_words: usize, is_large: bool) -> Self {
1415 let zipf = Zipf::new(SENTENCE_WORDS.len() as f64, 1.0).unwrap();
1417
1418 Self {
1419 min_words,
1420 max_words,
1421 zipf,
1422 is_large,
1423 }
1424 }
1425}
1426
1427impl ArrayGenerator for RandomSentenceGenerator {
1428 fn generate(
1429 &mut self,
1430 length: RowCount,
1431 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1432 ) -> Result<Arc<dyn Array>, ArrowError> {
1433 let mut values = Vec::with_capacity(length.0 as usize);
1434
1435 for _ in 0..length.0 {
1436 let num_words = rng.random_range(self.min_words..=self.max_words);
1437 let sentence: String = (0..num_words)
1438 .map(|_| {
1439 let idx = rng.sample(self.zipf) as usize - 1;
1441 SENTENCE_WORDS[idx]
1442 })
1443 .collect::<Vec<_>>()
1444 .join(" ");
1445 values.push(sentence);
1446 }
1447
1448 if self.is_large {
1449 Ok(Arc::new(LargeStringArray::from(values)))
1450 } else {
1451 Ok(Arc::new(StringArray::from(values)))
1452 }
1453 }
1454
1455 fn data_type(&self) -> &DataType {
1456 if self.is_large {
1457 &DataType::LargeUtf8
1458 } else {
1459 &DataType::Utf8
1460 }
1461 }
1462
1463 fn element_size_bytes(&self) -> Option<ByteCount> {
1464 let avg_word_length = 6;
1467 let avg_words = (self.min_words + self.max_words) / 2;
1468 Some(ByteCount::from((avg_word_length * avg_words) as u64))
1469 }
1470}
1471
1472#[derive(Debug)]
1473struct RandomWordGenerator {
1474 words: &'static [&'static str],
1475 is_large: bool,
1476}
1477
1478impl RandomWordGenerator {
1479 pub fn new(is_large: bool) -> Self {
1480 let words = ENGLISH_WORDS;
1481 Self { words, is_large }
1482 }
1483}
1484
1485impl ArrayGenerator for RandomWordGenerator {
1486 fn generate(
1487 &mut self,
1488 length: RowCount,
1489 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1490 ) -> Result<Arc<dyn Array>, ArrowError> {
1491 let mut values = Vec::with_capacity(length.0 as usize);
1492
1493 for _ in 0..length.0 {
1494 let word = self.words[rng.random_range(0..self.words.len())];
1495 values.push(word.to_string());
1496 }
1497
1498 if self.is_large {
1499 Ok(Arc::new(LargeStringArray::from(values)))
1500 } else {
1501 Ok(Arc::new(StringArray::from(values)))
1502 }
1503 }
1504
1505 fn data_type(&self) -> &DataType {
1506 if self.is_large {
1507 &DataType::LargeUtf8
1508 } else {
1509 &DataType::Utf8
1510 }
1511 }
1512
1513 fn element_size_bytes(&self) -> Option<ByteCount> {
1514 Some(ByteCount::from(5))
1516 }
1517}
1518
1519#[derive(Debug)]
1520pub struct VariableRandomBinaryGenerator {
1521 lengths_gen: Box<dyn ArrayGenerator>,
1522 data_type: DataType,
1523}
1524
1525impl VariableRandomBinaryGenerator {
1526 pub fn new(min_bytes_per_element: ByteCount, max_bytes_per_element: ByteCount) -> Self {
1527 let lengths_dist = Uniform::new_inclusive(
1528 min_bytes_per_element.0 as i32,
1529 max_bytes_per_element.0 as i32,
1530 )
1531 .unwrap();
1532 let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1533
1534 Self {
1535 lengths_gen,
1536 data_type: DataType::Binary,
1537 }
1538 }
1539}
1540
1541impl ArrayGenerator for VariableRandomBinaryGenerator {
1542 fn generate(
1543 &mut self,
1544 length: RowCount,
1545 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1546 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1547 let lengths = self.lengths_gen.generate(length, rng)?;
1548 let lengths = lengths.as_primitive::<Int32Type>();
1549 let total_length = lengths.values().iter().map(|i| *i as usize).sum::<usize>();
1550 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1551 let mut bytes = vec![0; total_length];
1552 rng.fill_bytes(&mut bytes);
1553 let bytes = Buffer::from(bytes);
1554 Ok(Arc::new(BinaryArray::try_new(offsets, bytes, None)?))
1555 }
1556
1557 fn data_type(&self) -> &DataType {
1558 &self.data_type
1559 }
1560
1561 fn element_size_bytes(&self) -> Option<ByteCount> {
1562 None
1563 }
1564}
1565
1566pub struct CycleBinaryGenerator<T: ByteArrayType> {
1567 values: Vec<u8>,
1568 lengths: Vec<usize>,
1569 data_type: DataType,
1570 array_type: PhantomData<T>,
1571 width: Option<ByteCount>,
1572 idx: usize,
1573}
1574
1575impl<T: ByteArrayType> std::fmt::Debug for CycleBinaryGenerator<T> {
1576 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1577 f.debug_struct("CycleBinaryGenerator")
1578 .field("values", &self.values)
1579 .field("lengths", &self.lengths)
1580 .field("data_type", &self.data_type)
1581 .field("width", &self.width)
1582 .field("idx", &self.idx)
1583 .finish()
1584 }
1585}
1586
1587impl<T: ByteArrayType> CycleBinaryGenerator<T> {
1588 pub fn from_strings(values: &[&str]) -> Self {
1589 if values.is_empty() {
1590 panic!("Attempt to create a cycle generator with no values");
1591 }
1592 let lengths = values.iter().map(|s| s.len()).collect::<Vec<_>>();
1593 let typical_length = lengths[0];
1594 let width = if lengths.iter().all(|item| *item == typical_length) {
1595 Some(ByteCount::from(
1596 typical_length as u64 + std::mem::size_of::<i32>() as u64,
1597 ))
1598 } else {
1599 None
1600 };
1601 let values = values
1602 .iter()
1603 .flat_map(|s| s.as_bytes().iter().copied())
1604 .collect::<Vec<_>>();
1605 Self {
1606 values,
1607 lengths,
1608 data_type: T::DATA_TYPE,
1609 array_type: PhantomData,
1610 width,
1611 idx: 0,
1612 }
1613 }
1614}
1615
1616impl<T: ByteArrayType> ArrayGenerator for CycleBinaryGenerator<T> {
1617 fn generate(
1618 &mut self,
1619 length: RowCount,
1620 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1621 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1622 let lengths = self
1623 .lengths
1624 .iter()
1625 .copied()
1626 .cycle()
1627 .skip(self.idx)
1628 .take(length.0 as usize);
1629 let num_bytes = lengths.clone().sum();
1630 let byte_offset = self.lengths[0..self.idx].iter().sum();
1631 let bytes = self
1632 .values
1633 .iter()
1634 .cycle()
1635 .skip(byte_offset)
1636 .copied()
1637 .take(num_bytes)
1638 .collect::<Vec<_>>();
1639 let bytes = Buffer::from(bytes);
1640 let offsets = OffsetBuffer::from_lengths(lengths);
1641 self.idx = (self.idx + length.0 as usize) % self.lengths.len();
1642 Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1643 offsets, bytes, None,
1644 )))
1645 }
1646
1647 fn data_type(&self) -> &DataType {
1648 &self.data_type
1649 }
1650
1651 fn element_size_bytes(&self) -> Option<ByteCount> {
1652 self.width
1653 }
1654}
1655
1656pub struct FixedBinaryGenerator<T: ByteArrayType> {
1657 value: Vec<u8>,
1658 data_type: DataType,
1659 array_type: PhantomData<T>,
1660}
1661
1662impl<T: ByteArrayType> std::fmt::Debug for FixedBinaryGenerator<T> {
1663 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1664 f.debug_struct("FixedBinaryGenerator")
1665 .field("value", &self.value)
1666 .field("data_type", &self.data_type)
1667 .finish()
1668 }
1669}
1670
1671impl<T: ByteArrayType> FixedBinaryGenerator<T> {
1672 pub fn new(value: Vec<u8>) -> Self {
1673 Self {
1674 value,
1675 data_type: T::DATA_TYPE,
1676 array_type: PhantomData,
1677 }
1678 }
1679}
1680
1681impl<T: ByteArrayType> ArrayGenerator for FixedBinaryGenerator<T> {
1682 fn generate(
1683 &mut self,
1684 length: RowCount,
1685 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1686 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1687 let bytes = Buffer::from(Vec::from_iter(
1688 self.value
1689 .iter()
1690 .cycle()
1691 .take((length.0 * self.value.len() as u64) as usize)
1692 .copied(),
1693 ));
1694 let offsets =
1695 OffsetBuffer::from_lengths(iter::repeat_n(self.value.len(), length.0 as usize));
1696 Ok(Arc::new(arrow_array::GenericByteArray::<T>::new(
1697 offsets, bytes, None,
1698 )))
1699 }
1700
1701 fn data_type(&self) -> &DataType {
1702 &self.data_type
1703 }
1704
1705 fn element_size_bytes(&self) -> Option<ByteCount> {
1706 Some(ByteCount::from(
1708 self.value.len() as u64 + std::mem::size_of::<i32>() as u64,
1709 ))
1710 }
1711}
1712
1713pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
1714 generator: Box<dyn ArrayGenerator>,
1715 data_type: DataType,
1716 key_type: PhantomData<K>,
1717 key_width: u64,
1718}
1719
1720impl<K: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryGenerator<K> {
1721 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1722 f.debug_struct("DictionaryGenerator")
1723 .field("generator", &self.generator)
1724 .field("data_type", &self.data_type)
1725 .field("key_width", &self.key_width)
1726 .finish()
1727 }
1728}
1729
1730impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
1731 fn new(generator: Box<dyn ArrayGenerator>) -> Self {
1732 let key_type = Box::new(K::DATA_TYPE);
1733 let key_width = key_type
1734 .primitive_width()
1735 .expect("dictionary key types should have a known width")
1736 as u64;
1737 let val_type = Box::new(generator.data_type().clone());
1738 let dict_type = DataType::Dictionary(key_type, val_type);
1739 Self {
1740 generator,
1741 data_type: dict_type,
1742 key_type: PhantomData,
1743 key_width,
1744 }
1745 }
1746}
1747
1748impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGenerator<K> {
1749 fn generate(
1750 &mut self,
1751 length: RowCount,
1752 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1753 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
1754 let underlying = self.generator.generate(length, rng)?;
1755 arrow_cast::cast::cast(&underlying, &self.data_type)
1756 }
1757
1758 fn data_type(&self) -> &DataType {
1759 &self.data_type
1760 }
1761
1762 fn element_size_bytes(&self) -> Option<ByteCount> {
1763 self.generator
1764 .element_size_bytes()
1765 .map(|size_bytes| ByteCount::from(size_bytes.0 + self.key_width))
1766 }
1767}
1768
1769struct LowCardinalityGenerator {
1772 inner: Box<dyn ArrayGenerator>,
1773 cardinality: usize,
1774 unique_values: Option<Arc<dyn Array>>,
1776}
1777
1778impl std::fmt::Debug for LowCardinalityGenerator {
1779 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1780 f.debug_struct("LowCardinalityGenerator")
1781 .field("inner", &self.inner)
1782 .field("cardinality", &self.cardinality)
1783 .field("initialized", &self.unique_values.is_some())
1784 .finish()
1785 }
1786}
1787
1788impl LowCardinalityGenerator {
1789 fn new(inner: Box<dyn ArrayGenerator>, cardinality: usize) -> Self {
1790 Self {
1791 inner,
1792 cardinality,
1793 unique_values: None,
1794 }
1795 }
1796}
1797
1798impl ArrayGenerator for LowCardinalityGenerator {
1799 fn generate(
1800 &mut self,
1801 length: RowCount,
1802 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1803 ) -> Result<Arc<dyn Array>, ArrowError> {
1804 if self.unique_values.is_none() {
1806 self.unique_values = Some(
1807 self.inner
1808 .generate(RowCount::from(self.cardinality as u64), rng)?,
1809 );
1810 }
1811
1812 let unique_values = self.unique_values.as_ref().unwrap();
1813
1814 let indices: Vec<usize> = (0..length.0)
1816 .map(|_| rng.random_range(0..self.cardinality))
1817 .collect();
1818
1819 let indices_array =
1821 arrow_array::UInt32Array::from(indices.iter().map(|&i| i as u32).collect::<Vec<_>>());
1822 arrow::compute::take(unique_values.as_ref(), &indices_array, None)
1823 .map(|arr| arr as Arc<dyn Array>)
1824 }
1825
1826 fn data_type(&self) -> &DataType {
1827 self.inner.data_type()
1828 }
1829
1830 fn element_size_bytes(&self) -> Option<ByteCount> {
1831 self.inner.element_size_bytes()
1832 }
1833}
1834
1835#[derive(Debug)]
1836struct RandomListGenerator {
1837 field: Arc<Field>,
1838 child_field: Arc<Field>,
1839 items_gen: Box<dyn ArrayGenerator>,
1840 lengths_gen: Box<dyn ArrayGenerator>,
1841 is_large: bool,
1842}
1843
1844impl RandomListGenerator {
1845 fn new(items_gen: Box<dyn ArrayGenerator>, is_large: bool) -> Self {
1847 let child_field = Arc::new(Field::new("item", items_gen.data_type().clone(), true));
1848 let list_type = if is_large {
1849 DataType::LargeList(child_field.clone())
1850 } else {
1851 DataType::List(child_field.clone())
1852 };
1853 let field = Field::new("", list_type, true);
1854 let lengths_gen = if is_large {
1855 let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1856 rand_with_distribution::<Int64Type, Uniform<i64>>(lengths_dist)
1857 } else {
1858 let lengths_dist = Uniform::new_inclusive(0, 10).unwrap();
1859 rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist)
1860 };
1861 Self {
1862 field: Arc::new(field),
1863 child_field,
1864 items_gen,
1865 lengths_gen,
1866 is_large,
1867 }
1868 }
1869}
1870
1871impl ArrayGenerator for RandomListGenerator {
1872 fn generate(
1873 &mut self,
1874 length: RowCount,
1875 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1876 ) -> Result<Arc<dyn Array>, ArrowError> {
1877 let lengths = self.lengths_gen.generate(length, rng)?;
1878 if self.is_large {
1879 let lengths = lengths.as_primitive::<Int64Type>();
1880 let total_length = lengths.values().iter().sum::<i64>() as u64;
1881 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1882 let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1883 Ok(Arc::new(LargeListArray::try_new(
1884 self.child_field.clone(),
1885 offsets,
1886 items,
1887 None,
1888 )?))
1889 } else {
1890 let lengths = lengths.as_primitive::<Int32Type>();
1891 let total_length = lengths.values().iter().sum::<i32>() as u64;
1892 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1893 let items = self.items_gen.generate(RowCount::from(total_length), rng)?;
1894 Ok(Arc::new(ListArray::try_new(
1895 self.child_field.clone(),
1896 offsets,
1897 items,
1898 None,
1899 )?))
1900 }
1901 }
1902
1903 fn data_type(&self) -> &DataType {
1904 self.field.data_type()
1905 }
1906
1907 fn element_size_bytes(&self) -> Option<ByteCount> {
1908 None
1909 }
1910}
1911
1912#[derive(Debug)]
1914struct RandomMapGenerator {
1915 field: Arc<Field>,
1916 entries_field: Arc<Field>,
1917 keys_gen: Box<dyn ArrayGenerator>,
1918 values_gen: Box<dyn ArrayGenerator>,
1919 lengths_gen: Box<dyn ArrayGenerator>,
1920}
1921
1922impl RandomMapGenerator {
1923 fn new(keys_gen: Box<dyn ArrayGenerator>, values_gen: Box<dyn ArrayGenerator>) -> Self {
1924 let entries_fields = Fields::from(vec![
1925 Field::new("keys", keys_gen.data_type().clone(), false),
1926 Field::new("values", values_gen.data_type().clone(), true),
1927 ]);
1928 let entries_field = Arc::new(Field::new(
1929 "entries",
1930 DataType::Struct(entries_fields),
1931 false,
1932 ));
1933 let map_type = DataType::Map(entries_field.clone(), false);
1934 let field = Arc::new(Field::new("", map_type, true));
1935 let lengths_dist = Uniform::new_inclusive(0_i32, 4).unwrap();
1936 let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist);
1937
1938 Self {
1939 field,
1940 entries_field,
1941 keys_gen,
1942 values_gen,
1943 lengths_gen,
1944 }
1945 }
1946}
1947
1948impl ArrayGenerator for RandomMapGenerator {
1949 fn generate(
1950 &mut self,
1951 length: RowCount,
1952 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
1953 ) -> Result<Arc<dyn Array>, ArrowError> {
1954 let lengths = self.lengths_gen.generate(length, rng)?;
1955 let lengths = lengths.as_primitive::<Int32Type>();
1956 let total_entries = lengths.values().iter().sum::<i32>() as u64;
1957 let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize));
1958
1959 let keys = self.keys_gen.generate(RowCount::from(total_entries), rng)?;
1960 let values = self
1961 .values_gen
1962 .generate(RowCount::from(total_entries), rng)?;
1963
1964 let entries = StructArray::new(
1965 Fields::from(vec![
1966 Field::new("keys", keys.data_type().clone(), false),
1967 Field::new("values", values.data_type().clone(), true),
1968 ]),
1969 vec![keys, values],
1970 None,
1971 );
1972
1973 Ok(Arc::new(MapArray::try_new(
1974 self.entries_field.clone(),
1975 offsets,
1976 entries,
1977 None,
1978 false,
1979 )?))
1980 }
1981
1982 fn data_type(&self) -> &DataType {
1983 self.field.data_type()
1984 }
1985
1986 fn element_size_bytes(&self) -> Option<ByteCount> {
1987 None
1988 }
1989}
1990
1991#[derive(Debug)]
1992struct NullArrayGenerator {}
1993
1994impl ArrayGenerator for NullArrayGenerator {
1995 fn generate(
1996 &mut self,
1997 length: RowCount,
1998 _: &mut rand_xoshiro::Xoshiro256PlusPlus,
1999 ) -> Result<Arc<dyn Array>, ArrowError> {
2000 Ok(Arc::new(NullArray::new(length.0 as usize)))
2001 }
2002
2003 fn data_type(&self) -> &DataType {
2004 &DataType::Null
2005 }
2006
2007 fn element_size_bytes(&self) -> Option<ByteCount> {
2008 None
2009 }
2010}
2011
2012#[derive(Debug)]
2014struct RadialStepGenerator {
2015 num_steps_per_circle: u32,
2016 data_field: Arc<Field>,
2017 data_type: DataType,
2018 current_step: u32,
2019}
2020
2021impl RadialStepGenerator {
2022 fn new(num_steps_per_circle: u32) -> Self {
2023 let data_field = Arc::new(Field::new("item", DataType::Float32, false));
2024 let data_type = DataType::FixedSizeList(data_field.clone(), 2);
2025 Self {
2026 num_steps_per_circle,
2027 data_field,
2028 data_type,
2029 current_step: 0,
2030 }
2031 }
2032}
2033
2034impl ArrayGenerator for RadialStepGenerator {
2035 fn generate(
2036 &mut self,
2037 length: RowCount,
2038 _rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2039 ) -> Result<Arc<dyn Array>, ArrowError> {
2040 let mut values_builder = Float32Builder::with_capacity(length.0 as usize * 2);
2041 for _ in 0..length.0 {
2042 let angle = (self.current_step as f32) / (self.num_steps_per_circle as f32)
2043 * 2.0
2044 * std::f32::consts::PI;
2045 values_builder.append_value(angle.cos());
2046 values_builder.append_value(angle.sin());
2047 self.current_step = (self.current_step + 1) % self.num_steps_per_circle;
2048 }
2049 let values = values_builder.finish();
2050 let vectors =
2051 FixedSizeListArray::try_new(self.data_field.clone(), 2, Arc::new(values), None)?;
2052 Ok(Arc::new(vectors))
2053 }
2054
2055 fn data_type(&self) -> &DataType {
2056 &self.data_type
2057 }
2058
2059 fn element_size_bytes(&self) -> Option<ByteCount> {
2060 Some(ByteCount::from(8))
2061 }
2062}
2063
2064#[derive(Debug)]
2066struct JitterCentroidsGenerator {
2067 centroids: Float32Array,
2068 dimension: u32,
2069 noise_level: f32,
2070 data_type: DataType,
2071 data_field: Arc<Field>,
2072
2073 offset: usize,
2074}
2075
2076impl JitterCentroidsGenerator {
2077 fn try_new(centroids: Arc<dyn Array>, noise_level: f32) -> Result<Self, ArrowError> {
2078 let DataType::FixedSizeList(values_field, dimension) = centroids.data_type() else {
2079 return Err(ArrowError::InvalidArgumentError(
2080 "Centroids must be a FixedSizeList".to_string(),
2081 ));
2082 };
2083 if values_field.data_type() != &DataType::Float32 {
2084 return Err(ArrowError::InvalidArgumentError(
2085 "Centroids values must be a Float32".to_string(),
2086 ));
2087 }
2088 let data_type = DataType::FixedSizeList(values_field.clone(), *dimension);
2089 Ok(Self {
2090 centroids: centroids
2091 .as_fixed_size_list()
2092 .values()
2093 .as_primitive::<Float32Type>()
2094 .clone(),
2095 dimension: *dimension as u32,
2096 noise_level,
2097 data_type,
2098 data_field: values_field.clone(),
2099 offset: 0,
2100 })
2101 }
2102}
2103
2104impl ArrayGenerator for JitterCentroidsGenerator {
2105 fn generate(
2106 &mut self,
2107 length: RowCount,
2108 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2109 ) -> Result<Arc<dyn Array>, ArrowError> {
2110 let mut values_builder =
2111 Float32Builder::with_capacity(length.0 as usize * self.dimension as usize);
2112 for _ in 0..length.0 {
2113 let mut noise = (0..self.dimension as usize)
2115 .map(|_| rng.random::<f32>())
2116 .collect::<Vec<_>>();
2117 let scale = self.noise_level / noise.iter().map(|v| v * v).sum::<f32>().sqrt();
2119 noise.iter_mut().for_each(|v| *v *= scale);
2120
2121 for (i, noise) in noise.into_iter().enumerate() {
2123 let centroid_val = self.centroids.value(self.offset + i);
2124 let jittered_val = centroid_val + noise;
2125 values_builder.append_value(jittered_val);
2126 }
2127 self.offset = (self.offset + self.dimension as usize) % self.centroids.len();
2129 }
2130 let values = values_builder.finish();
2131 let vectors = FixedSizeListArray::try_new(
2132 self.data_field.clone(),
2133 self.dimension as i32,
2134 Arc::new(values),
2135 None,
2136 )?;
2137 Ok(Arc::new(vectors))
2138 }
2139
2140 fn data_type(&self) -> &DataType {
2141 &self.data_type
2142 }
2143
2144 fn element_size_bytes(&self) -> Option<ByteCount> {
2145 Some(ByteCount::from(self.dimension as u64 * 4))
2146 }
2147}
2148#[derive(Debug)]
2149struct RandomStructGenerator {
2150 fields: Fields,
2151 data_type: DataType,
2152 child_gens: Vec<Box<dyn ArrayGenerator>>,
2153}
2154
2155impl RandomStructGenerator {
2156 fn new(fields: Fields, child_gens: Vec<Box<dyn ArrayGenerator>>) -> Self {
2157 let data_type = DataType::Struct(fields.clone());
2158 Self {
2159 fields,
2160 data_type,
2161 child_gens,
2162 }
2163 }
2164}
2165
2166impl ArrayGenerator for RandomStructGenerator {
2167 fn generate(
2168 &mut self,
2169 length: RowCount,
2170 rng: &mut rand_xoshiro::Xoshiro256PlusPlus,
2171 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2172 if self.child_gens.is_empty() {
2173 let struct_arr = StructArray::new_empty_fields(length.0 as usize, None);
2176 return Ok(Arc::new(struct_arr));
2177 }
2178 let child_arrays = self
2179 .child_gens
2180 .iter_mut()
2181 .map(|genn| genn.generate(length, rng))
2182 .collect::<Result<Vec<_>, ArrowError>>()?;
2183 let struct_arr = StructArray::new(self.fields.clone(), child_arrays, None);
2184 Ok(Arc::new(struct_arr))
2185 }
2186
2187 fn data_type(&self) -> &DataType {
2188 &self.data_type
2189 }
2190
2191 fn element_size_bytes(&self) -> Option<ByteCount> {
2192 let mut sum = 0;
2193 for child_gen in &self.child_gens {
2194 sum += child_gen.element_size_bytes()?.0;
2195 }
2196 Some(ByteCount::from(sum))
2197 }
2198}
2199
2200pub struct FixedSizeBatchGenerator {
2202 rng: rand_xoshiro::Xoshiro256PlusPlus,
2203 generators: Vec<Box<dyn ArrayGenerator>>,
2204 batch_size: RowCount,
2205 num_batches: BatchCount,
2206 schema: SchemaRef,
2207}
2208
2209impl FixedSizeBatchGenerator {
2210 fn new(
2211 generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
2212 batch_size: RowCount,
2213 num_batches: BatchCount,
2214 seed: Option<Seed>,
2215 default_null_probability: Option<f64>,
2216 ) -> Self {
2217 let mut fields = Vec::with_capacity(generators.len());
2218 for (field_index, field_gen) in generators.iter().enumerate() {
2219 let (name, genn) = field_gen;
2220 let default_name = format!("field_{}", field_index);
2221 let name = name.clone().unwrap_or(default_name);
2222 let mut field = Field::new(name, genn.data_type().clone(), true);
2223 if let Some(metadata) = genn.metadata() {
2224 field = field.with_metadata(metadata);
2225 }
2226 fields.push(field);
2227 }
2228 let mut generators = generators
2229 .into_iter()
2230 .map(|(_, genn)| genn)
2231 .collect::<Vec<_>>();
2232 if let Some(null_probability) = default_null_probability {
2233 generators = generators
2234 .into_iter()
2235 .map(|genn| genn.with_random_nulls(null_probability))
2236 .collect();
2237 }
2238 let schema = Arc::new(Schema::new(fields));
2239 Self {
2240 rng: rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2241 seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2242 ),
2243 generators,
2244 batch_size,
2245 num_batches,
2246 schema,
2247 }
2248 }
2249
2250 fn gen_next(&mut self) -> Result<RecordBatch, ArrowError> {
2251 let mut arrays = Vec::with_capacity(self.generators.len());
2252 for genn in self.generators.iter_mut() {
2253 let arr = genn.generate(self.batch_size, &mut self.rng)?;
2254 arrays.push(arr);
2255 }
2256 self.num_batches.0 -= 1;
2257 Ok(RecordBatch::try_new_with_options(
2258 self.schema.clone(),
2259 arrays,
2260 &RecordBatchOptions::new().with_row_count(Some(self.batch_size.0 as usize)),
2261 )
2262 .unwrap())
2263 }
2264}
2265
2266impl Iterator for FixedSizeBatchGenerator {
2267 type Item = Result<RecordBatch, ArrowError>;
2268
2269 fn next(&mut self) -> Option<Self::Item> {
2270 if self.num_batches.0 == 0 {
2271 return None;
2272 }
2273 Some(self.gen_next())
2274 }
2275}
2276
2277impl RecordBatchReader for FixedSizeBatchGenerator {
2278 fn schema(&self) -> SchemaRef {
2279 self.schema.clone()
2280 }
2281}
2282
/// Builder that collects column generators and turns them into record batches
/// or record batch readers.
#[derive(Default)]
pub struct BatchGeneratorBuilder {
    /// Column generators in schema order, each with an optional column name.
    generators: Vec<(Option<String>, Box<dyn ArrayGenerator>)>,
    /// Null probability handed to the reader for every column, if set.
    default_null_probability: Option<f64>,
    /// Seed for the random number generator; a default seed is used when unset.
    seed: Option<Seed>,
}
2293
/// How to pick a row count when a requested batch size in bytes is not a
/// whole multiple of the row size.
pub enum RoundingBehavior {
    /// Fail with an error instead of rounding.
    ExactOrErr,
    /// Use one more row, so the batch is at least the requested size.
    RoundUp,
    /// Use the truncated row count, so the batch is at most the requested size.
    RoundDown,
}
2299
2300impl BatchGeneratorBuilder {
2301 pub fn new() -> Self {
2303 Default::default()
2304 }
2305
2306 pub fn new_with_seed(seed: Seed) -> Self {
2308 Self {
2309 seed: Some(seed),
2310 ..Default::default()
2311 }
2312 }
2313
2314 pub fn col(mut self, name: impl Into<String>, genn: Box<dyn ArrayGenerator>) -> Self {
2318 self.generators.push((Some(name.into()), genn));
2319 self
2320 }
2321
2322 pub fn anon_col(mut self, genn: Box<dyn ArrayGenerator>) -> Self {
2326 self.generators.push((None, genn));
2327 self
2328 }
2329
2330 pub fn into_batch_rows(self, batch_size: RowCount) -> Result<RecordBatch, ArrowError> {
2331 let mut reader = self.into_reader_rows(batch_size, BatchCount::from(1));
2332 reader
2333 .next()
2334 .expect("Asked for 1 batch but reader was empty")
2335 }
2336
2337 pub fn into_batch_bytes(
2338 self,
2339 batch_size: ByteCount,
2340 rounding: RoundingBehavior,
2341 ) -> Result<RecordBatch, ArrowError> {
2342 let mut reader = self.into_reader_bytes(batch_size, BatchCount::from(1), rounding)?;
2343 reader
2344 .next()
2345 .expect("Asked for 1 batch but reader was empty")
2346 }
2347
2348 pub fn into_reader_rows(
2350 self,
2351 batch_size: RowCount,
2352 num_batches: BatchCount,
2353 ) -> impl RecordBatchReader {
2354 FixedSizeBatchGenerator::new(
2355 self.generators,
2356 batch_size,
2357 num_batches,
2358 self.seed,
2359 self.default_null_probability,
2360 )
2361 }
2362
2363 pub fn into_reader_stream(
2364 self,
2365 batch_size: RowCount,
2366 num_batches: BatchCount,
2367 ) -> (
2368 BoxStream<'static, Result<RecordBatch, ArrowError>>,
2369 Arc<Schema>,
2370 ) {
2371 let reader = self.into_reader_rows(batch_size, num_batches);
2373 let schema = reader.schema();
2374 let batches = reader.collect::<Vec<_>>();
2375 (futures::stream::iter(batches).boxed(), schema)
2376 }
2377
2378 pub fn into_reader_bytes(
2380 self,
2381 batch_size_bytes: ByteCount,
2382 num_batches: BatchCount,
2383 rounding: RoundingBehavior,
2384 ) -> Result<impl RecordBatchReader, ArrowError> {
2385 let bytes_per_row = self
2386 .generators
2387 .iter()
2388 .map(|genn| genn.1.element_size_bytes().map(|byte_count| byte_count.0).ok_or(
2389 ArrowError::NotYetImplemented("The function into_reader_bytes currently requires each array generator to have a fixed element size".to_string())
2390 )
2391 )
2392 .sum::<Result<u64, ArrowError>>()?;
2393 let mut num_rows = RowCount::from(batch_size_bytes.0 / bytes_per_row);
2394 if !batch_size_bytes.0.is_multiple_of(bytes_per_row) {
2395 match rounding {
2396 RoundingBehavior::ExactOrErr => {
2397 return Err(ArrowError::NotYetImplemented(format!(
2398 "Exact rounding requested but not possible. Batch size requested {}, row size: {}",
2399 batch_size_bytes.0, bytes_per_row
2400 )));
2401 }
2402 RoundingBehavior::RoundUp => {
2403 num_rows = RowCount::from(num_rows.0 + 1);
2404 }
2405 RoundingBehavior::RoundDown => (),
2406 }
2407 }
2408 Ok(self.into_reader_rows(num_rows, num_batches))
2409 }
2410
2411 pub fn with_seed(mut self, seed: Seed) -> Self {
2413 self.seed = Some(seed);
2414 self
2415 }
2416
2417 pub fn with_random_nulls(&mut self, default_null_probability: f64) {
2419 self.default_null_probability = Some(default_null_probability);
2420 }
2421}
2422
2423pub struct ArrayGeneratorBuilder {
2425 generator: Box<dyn ArrayGenerator>,
2426 seed: Option<Seed>,
2427}
2428
2429impl ArrayGeneratorBuilder {
2430 fn new(generator: Box<dyn ArrayGenerator>) -> Self {
2431 Self {
2432 generator,
2433 seed: None,
2434 }
2435 }
2436
2437 pub fn with_seed(mut self, seed: Seed) -> Self {
2439 self.seed = Some(seed);
2440 self
2441 }
2442
2443 pub fn into_array_rows(
2445 mut self,
2446 length: RowCount,
2447 ) -> Result<Arc<dyn arrow_array::Array>, ArrowError> {
2448 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(
2449 self.seed.map(|s| s.0).unwrap_or(DEFAULT_SEED.0),
2450 );
2451 self.generator.generate(length, &mut rng)
2452 }
2453}
2454
/// Number of milliseconds in one day (24 * 60 * 60 * 1000).
const MS_PER_DAY: i64 = 86400000;
2456
2457pub mod array {
2458
2459 use arrow::datatypes::{Int8Type, Int16Type, Int64Type};
2460 use arrow_array::types::{
2461 Decimal128Type, Decimal256Type, DurationMicrosecondType, DurationMillisecondType,
2462 DurationNanosecondType, DurationSecondType, Float16Type, Float32Type, Float64Type,
2463 UInt8Type, UInt16Type, UInt32Type, UInt64Type,
2464 };
2465 use arrow_array::{
2466 ArrowNativeTypeOp, BooleanArray, Date32Array, Date64Array, Time32MillisecondArray,
2467 Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
2468 TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
2469 TimestampSecondArray,
2470 };
2471 use arrow_schema::{IntervalUnit, TimeUnit};
2472 use chrono::Utc;
2473 use rand::prelude::Distribution;
2474
2475 use super::*;
2476
2477 pub fn cycle_vec(
2482 generator: Box<dyn ArrayGenerator>,
2483 dimension: Dimension,
2484 ) -> Box<dyn ArrayGenerator> {
2485 Box::new(CycleVectorGenerator::new(generator, dimension))
2486 }
2487
2488 pub fn cycle_vec_var(
2493 generator: Box<dyn ArrayGenerator>,
2494 min_list_size: Dimension,
2495 max_list_size: Dimension,
2496 ) -> Box<dyn ArrayGenerator> {
2497 Box::new(CycleListGenerator::new(
2498 generator,
2499 min_list_size,
2500 max_list_size,
2501 ))
2502 }
2503
2504 pub fn cycle_unit_circle(num_steps: u32) -> Box<dyn ArrayGenerator> {
2509 Box::new(RadialStepGenerator::new(num_steps))
2510 }
2511
2512 pub fn jitter_centroids(centroids: Arc<dyn Array>, jitter: f32) -> Box<dyn ArrayGenerator> {
2516 Box::new(JitterCentroidsGenerator::try_new(centroids, jitter).unwrap())
2517 }
2518
2519 pub fn cycle<DataType>(values: Vec<DataType::Native>) -> Box<dyn ArrayGenerator>
2524 where
2525 DataType::Native: Copy + 'static,
2526 DataType: ArrowPrimitiveType,
2527 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2528 {
2529 let mut values_idx = 0;
2530 Box::new(
2531 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2532 DataType::DATA_TYPE,
2533 move |_| {
2534 let y = values[values_idx];
2535 values_idx = (values_idx + 1) % values.len();
2536 y
2537 },
2538 1,
2539 DataType::DATA_TYPE
2540 .primitive_width()
2541 .map(|width| ByteCount::from(width as u64))
2542 .expect("Primitive types should have a fixed width"),
2543 ),
2544 )
2545 }
2546
2547 pub fn cycle_bool(values: Vec<bool>) -> Box<dyn ArrayGenerator> {
2552 let mut values_idx = 0;
2553 Box::new(FnGen::<bool, BooleanArray, _>::new_unknown_size(
2554 DataType::Boolean,
2555 move |_| {
2556 let val = values[values_idx];
2557 values_idx = (values_idx + 1) % values.len();
2558 val
2559 },
2560 1,
2561 ))
2562 }
2563
2564 pub fn step<DataType>() -> Box<dyn ArrayGenerator>
2566 where
2567 DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2568 DataType: ArrowPrimitiveType,
2569 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2570 {
2571 let mut x = DataType::Native::default();
2572 Box::new(
2573 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2574 DataType::DATA_TYPE,
2575 move |_| {
2576 let y = x;
2577 x += DataType::Native::ONE;
2578 y
2579 },
2580 1,
2581 DataType::DATA_TYPE
2582 .primitive_width()
2583 .map(|width| ByteCount::from(width as u64))
2584 .expect("Primitive types should have a fixed width"),
2585 ),
2586 )
2587 }
2588
2589 pub fn blob() -> Box<dyn ArrayGenerator> {
2590 let mut blob_meta = HashMap::new();
2591 blob_meta.insert("lance-encoding:blob".to_string(), "true".to_string());
2592 rand_fixedbin(ByteCount::from(4 * 1024 * 1024), true).with_metadata(blob_meta)
2593 }
2594
2595 pub fn step_custom<DataType>(
2597 start: DataType::Native,
2598 step: DataType::Native,
2599 ) -> Box<dyn ArrayGenerator>
2600 where
2601 DataType::Native: Copy + Default + std::ops::AddAssign<DataType::Native> + 'static,
2602 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2603 DataType: ArrowPrimitiveType,
2604 {
2605 let mut x = start;
2606 Box::new(
2607 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2608 DataType::DATA_TYPE,
2609 move |_| {
2610 let y = x;
2611 x += step;
2612 y
2613 },
2614 1,
2615 DataType::DATA_TYPE
2616 .primitive_width()
2617 .map(|width| ByteCount::from(width as u64))
2618 .expect("Primitive types should have a fixed width"),
2619 ),
2620 )
2621 }
2622
2623 pub fn fill<DataType>(value: DataType::Native) -> Box<dyn ArrayGenerator>
2625 where
2626 DataType::Native: Copy + 'static,
2627 DataType: ArrowPrimitiveType,
2628 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2629 {
2630 Box::new(
2631 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2632 DataType::DATA_TYPE,
2633 move |_| value,
2634 1,
2635 DataType::DATA_TYPE
2636 .primitive_width()
2637 .map(|width| ByteCount::from(width as u64))
2638 .expect("Primitive types should have a fixed width"),
2639 ),
2640 )
2641 }
2642
2643 pub fn fill_varbin(value: Vec<u8>) -> Box<dyn ArrayGenerator> {
2645 Box::new(FixedBinaryGenerator::<BinaryType>::new(value))
2646 }
2647
2648 pub fn fill_utf8(value: String) -> Box<dyn ArrayGenerator> {
2650 Box::new(FixedBinaryGenerator::<Utf8Type>::new(value.into_bytes()))
2651 }
2652
2653 pub fn cycle_utf8_literals(values: &[&'static str]) -> Box<dyn ArrayGenerator> {
2654 Box::new(CycleBinaryGenerator::<Utf8Type>::from_strings(values))
2655 }
2656
2657 pub fn rand<DataType>() -> Box<dyn ArrayGenerator>
2659 where
2660 DataType::Native: Copy + 'static,
2661 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2662 DataType: ArrowPrimitiveType,
2663 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2664 {
2665 Box::new(
2666 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2667 DataType::DATA_TYPE,
2668 move |rng| rng.random(),
2669 1,
2670 DataType::DATA_TYPE
2671 .primitive_width()
2672 .map(|width| ByteCount::from(width as u64))
2673 .expect("Primitive types should have a fixed width"),
2674 ),
2675 )
2676 }
2677
2678 pub fn rand_with_distribution<
2680 DataType,
2681 Dist: rand::distr::Distribution<DataType::Native> + Clone + Send + Sync + 'static,
2682 >(
2683 dist: Dist,
2684 ) -> Box<dyn ArrayGenerator>
2685 where
2686 DataType::Native: Copy + 'static,
2687 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2688 DataType: ArrowPrimitiveType,
2689 {
2690 Box::new(
2691 FnGen::<DataType::Native, PrimitiveArray<DataType>, _>::new_known_size(
2692 DataType::DATA_TYPE,
2693 move |rng| rng.sample(dist.clone()),
2694 1,
2695 DataType::DATA_TYPE
2696 .primitive_width()
2697 .map(|width| ByteCount::from(width as u64))
2698 .expect("Primitive types should have a fixed width"),
2699 ),
2700 )
2701 }
2702
2703 pub fn rand_vec<DataType>(dimension: Dimension) -> Box<dyn ArrayGenerator>
2705 where
2706 DataType::Native: Copy + 'static,
2707 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2708 DataType: ArrowPrimitiveType,
2709 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2710 {
2711 let underlying = rand::<DataType>();
2712 cycle_vec(underlying, dimension)
2713 }
2714
2715 pub fn rand_vec_nullable<DataType>(
2717 dimension: Dimension,
2718 null_probability: f64,
2719 ) -> Box<dyn ArrayGenerator>
2720 where
2721 DataType::Native: Copy + 'static,
2722 PrimitiveArray<DataType>: From<Vec<DataType::Native>> + 'static,
2723 DataType: ArrowPrimitiveType,
2724 rand::distr::StandardUniform: rand::distr::Distribution<DataType::Native>,
2725 {
2726 let underlying = rand::<DataType>().with_random_nulls(null_probability);
2727 cycle_vec(underlying, dimension)
2728 }
2729
2730 pub fn rand_time32(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2733 let start = 0;
2734 let end = match resolution {
2735 TimeUnit::Second => 86_400,
2736 TimeUnit::Millisecond => 86_400_000,
2737 _ => panic!(),
2738 };
2739
2740 let data_type = DataType::Time32(*resolution);
2741 let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2742 let dist = Uniform::new(start, end).unwrap();
2743 let sample_fn = move |rng: &mut _| dist.sample(rng);
2744
2745 match resolution {
2746 TimeUnit::Second => Box::new(FnGen::<i32, Time32SecondArray, _>::new_known_size(
2747 data_type, sample_fn, 1, size,
2748 )),
2749 TimeUnit::Millisecond => {
2750 Box::new(FnGen::<i32, Time32MillisecondArray, _>::new_known_size(
2751 data_type, sample_fn, 1, size,
2752 ))
2753 }
2754 _ => panic!(),
2755 }
2756 }
2757
2758 pub fn rand_time64(resolution: &TimeUnit) -> Box<dyn ArrayGenerator> {
2761 let start = 0_i64;
2762 let end: i64 = match resolution {
2763 TimeUnit::Microsecond => 86_400_000,
2764 TimeUnit::Nanosecond => 86_400_000_000,
2765 _ => panic!(),
2766 };
2767
2768 let data_type = DataType::Time64(*resolution);
2769 let size = ByteCount::from(data_type.primitive_width().unwrap() as u64);
2770 let dist = Uniform::new(start, end).unwrap();
2771 let sample_fn = move |rng: &mut _| dist.sample(rng);
2772
2773 match resolution {
2774 TimeUnit::Microsecond => {
2775 Box::new(FnGen::<i64, Time64MicrosecondArray, _>::new_known_size(
2776 data_type, sample_fn, 1, size,
2777 ))
2778 }
2779 TimeUnit::Nanosecond => {
2780 Box::new(FnGen::<i64, Time64NanosecondArray, _>::new_known_size(
2781 data_type, sample_fn, 1, size,
2782 ))
2783 }
2784 _ => panic!(),
2785 }
2786 }
2787
2788 pub fn rand_pseudo_uuid() -> Box<dyn ArrayGenerator> {
2794 Box::<PseudoUuidGenerator>::default()
2795 }
2796
2797 pub fn rand_pseudo_uuid_hex() -> Box<dyn ArrayGenerator> {
2804 Box::<PseudoUuidHexGenerator>::default()
2805 }
2806
2807 pub fn rand_primitive<T: ArrowPrimitiveType + Send + Sync>(
2808 data_type: DataType,
2809 ) -> Box<dyn ArrayGenerator> {
2810 Box::new(RandomBytesGenerator::<T>::new(data_type))
2811 }
2812
2813 pub fn rand_fsb(size: i32) -> Box<dyn ArrayGenerator> {
2814 Box::new(RandomFixedSizeBinaryGenerator::new(size))
2815 }
2816
2817 pub fn rand_interval(unit: IntervalUnit) -> Box<dyn ArrayGenerator> {
2818 Box::new(RandomIntervalGenerator::new(unit))
2819 }
2820
2821 pub fn rand_date32() -> Box<dyn ArrayGenerator> {
2826 let now = chrono::Utc::now();
2827 let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try days");
2828 rand_date32_in_range(one_year_ago, now)
2829 }
2830
2831 pub fn rand_date32_in_range(
2833 start: chrono::DateTime<Utc>,
2834 end: chrono::DateTime<Utc>,
2835 ) -> Box<dyn ArrayGenerator> {
2836 let data_type = DataType::Date32;
2837 let end_ms = end.timestamp_millis();
2838 let end_days = (end_ms / MS_PER_DAY) as i32;
2839 let start_ms = start.timestamp_millis();
2840 let start_days = (start_ms / MS_PER_DAY) as i32;
2841 let dist = Uniform::new(start_days, end_days).unwrap();
2842
2843 Box::new(FnGen::<i32, Date32Array, _>::new_known_size(
2844 data_type,
2845 move |rng| dist.sample(rng),
2846 1,
2847 DataType::Date32
2848 .primitive_width()
2849 .map(|width| ByteCount::from(width as u64))
2850 .expect("Date32 should have a fixed width"),
2851 ))
2852 }
2853
2854 pub fn rand_date64() -> Box<dyn ArrayGenerator> {
2859 let now = chrono::Utc::now();
2860 let one_year_ago = now - chrono::TimeDelta::try_days(365).expect("TimeDelta try_days");
2861 rand_date64_in_range(one_year_ago, now)
2862 }
2863
2864 pub fn rand_timestamp_in_range(
2868 start: chrono::DateTime<Utc>,
2869 end: chrono::DateTime<Utc>,
2870 data_type: &DataType,
2871 ) -> Box<dyn ArrayGenerator> {
2872 let end_ms = end.timestamp_millis();
2873 let start_ms = start.timestamp_millis();
2874 let (start_ticks, end_ticks) = match data_type {
2875 DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2876 (start_ms * 1000 * 1000, end_ms * 1000 * 1000)
2877 }
2878 DataType::Timestamp(TimeUnit::Microsecond, _) => (start_ms * 1000, end_ms * 1000),
2879 DataType::Timestamp(TimeUnit::Millisecond, _) => (start_ms, end_ms),
2880 DataType::Timestamp(TimeUnit::Second, _) => (start.timestamp(), end.timestamp()),
2881 _ => panic!(),
2882 };
2883 let dist = Uniform::new(start_ticks, end_ticks).unwrap();
2884
2885 let data_type = data_type.clone();
2886 let sample_fn = move |rng: &mut _| dist.sample(rng);
2887 let width = data_type
2888 .primitive_width()
2889 .map(|width| ByteCount::from(width as u64))
2890 .unwrap();
2891
2892 match data_type {
2893 DataType::Timestamp(TimeUnit::Nanosecond, _) => {
2894 Box::new(FnGen::<i64, TimestampNanosecondArray, _>::new_known_size(
2895 data_type, sample_fn, 1, width,
2896 ))
2897 }
2898 DataType::Timestamp(TimeUnit::Microsecond, _) => {
2899 Box::new(FnGen::<i64, TimestampMicrosecondArray, _>::new_known_size(
2900 data_type, sample_fn, 1, width,
2901 ))
2902 }
2903 DataType::Timestamp(TimeUnit::Millisecond, _) => {
2904 Box::new(FnGen::<i64, TimestampMillisecondArray, _>::new_known_size(
2905 data_type, sample_fn, 1, width,
2906 ))
2907 }
2908 DataType::Timestamp(TimeUnit::Second, _) => {
2909 Box::new(FnGen::<i64, TimestampSecondArray, _>::new_known_size(
2910 data_type, sample_fn, 1, width,
2911 ))
2912 }
2913 _ => panic!(),
2914 }
2915 }
2916
2917 pub fn rand_timestamp(data_type: &DataType) -> Box<dyn ArrayGenerator> {
2918 let now = chrono::Utc::now();
2919 let one_year_ago = now - chrono::Duration::try_days(365).unwrap();
2920 rand_timestamp_in_range(one_year_ago, now, data_type)
2921 }
2922
2923 pub fn rand_date64_in_range(
2928 start: chrono::DateTime<Utc>,
2929 end: chrono::DateTime<Utc>,
2930 ) -> Box<dyn ArrayGenerator> {
2931 let data_type = DataType::Date64;
2932 let end_ms = end.timestamp_millis();
2933 let end_days = end_ms / MS_PER_DAY;
2934 let start_ms = start.timestamp_millis();
2935 let start_days = start_ms / MS_PER_DAY;
2936 let dist = Uniform::new(start_days, end_days).unwrap();
2937
2938 Box::new(FnGen::<i64, Date64Array, _>::new_known_size(
2939 data_type,
2940 move |rng| (dist.sample(rng)) * MS_PER_DAY,
2941 1,
2942 DataType::Date64
2943 .primitive_width()
2944 .map(|width| ByteCount::from(width as u64))
2945 .expect("Date64 should have a fixed width"),
2946 ))
2947 }
2948
2949 pub fn rand_fixedbin(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2951 Box::new(RandomBinaryGenerator::new(
2952 bytes_per_element,
2953 false,
2954 is_large,
2955 ))
2956 }
2957
2958 pub fn rand_varbin(
2962 min_bytes_per_element: ByteCount,
2963 max_bytes_per_element: ByteCount,
2964 ) -> Box<dyn ArrayGenerator> {
2965 Box::new(VariableRandomBinaryGenerator::new(
2966 min_bytes_per_element,
2967 max_bytes_per_element,
2968 ))
2969 }
2970
2971 pub fn rand_utf8(bytes_per_element: ByteCount, is_large: bool) -> Box<dyn ArrayGenerator> {
2975 Box::new(RandomBinaryGenerator::new(
2976 bytes_per_element,
2977 true,
2978 is_large,
2979 ))
2980 }
2981
2982 pub fn utf8_prefix_plus_counter(
2986 prefix: impl Into<String>,
2987 is_large: bool,
2988 ) -> Box<dyn ArrayGenerator> {
2989 Box::new(PrefixPlusCounterGenerator::new(prefix.into(), is_large))
2990 }
2991
2992 pub fn binary_prefix_plus_counter(
2993 prefix: Arc<[u8]>,
2994 is_large: bool,
2995 ) -> Box<dyn ArrayGenerator> {
2996 Box::new(BinaryPrefixPlusCounterGenerator::new(prefix, is_large))
2997 }
2998
2999 pub fn rand_boolean() -> Box<dyn ArrayGenerator> {
3001 Box::<RandomBooleanGenerator>::default()
3002 }
3003
3004 pub fn random_sentence(
3008 min_words: usize,
3009 max_words: usize,
3010 is_large: bool,
3011 ) -> Box<dyn ArrayGenerator> {
3012 Box::new(RandomSentenceGenerator::new(min_words, max_words, is_large))
3013 }
3014
3015 pub fn random_word(is_large: bool) -> Box<dyn ArrayGenerator> {
3019 Box::new(RandomWordGenerator::new(is_large))
3020 }
3021
3022 pub fn rand_list(item_type: &DataType, is_large: bool) -> Box<dyn ArrayGenerator> {
3023 let child_gen = rand_type(item_type);
3024 Box::new(RandomListGenerator::new(child_gen, is_large))
3025 }
3026
3027 pub fn rand_list_any(
3028 item_gen: Box<dyn ArrayGenerator>,
3029 is_large: bool,
3030 ) -> Box<dyn ArrayGenerator> {
3031 Box::new(RandomListGenerator::new(item_gen, is_large))
3032 }
3033
3034 pub fn rand_map(key_type: &DataType, value_type: &DataType) -> Box<dyn ArrayGenerator> {
3036 let keys_gen = rand_type(key_type);
3037 let values_gen = rand_type(value_type);
3038 Box::new(RandomMapGenerator::new(keys_gen, values_gen))
3039 }
3040
3041 pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> {
3042 let child_gens = fields
3043 .iter()
3044 .map(|f| rand_type(f.data_type()))
3045 .collect::<Vec<_>>();
3046 Box::new(RandomStructGenerator::new(fields, child_gens))
3047 }
3048
3049 pub fn null_type() -> Box<dyn ArrayGenerator> {
3050 Box::new(NullArrayGenerator {})
3051 }
3052
3053 pub fn rand_type(data_type: &DataType) -> Box<dyn ArrayGenerator> {
3055 match data_type {
3056 DataType::Boolean => rand_boolean(),
3057 DataType::Int8 => rand::<Int8Type>(),
3058 DataType::Int16 => rand::<Int16Type>(),
3059 DataType::Int32 => rand::<Int32Type>(),
3060 DataType::Int64 => rand::<Int64Type>(),
3061 DataType::UInt8 => rand::<UInt8Type>(),
3062 DataType::UInt16 => rand::<UInt16Type>(),
3063 DataType::UInt32 => rand::<UInt32Type>(),
3064 DataType::UInt64 => rand::<UInt64Type>(),
3065 DataType::Float16 => rand_primitive::<Float16Type>(data_type.clone()),
3066 DataType::Float32 => rand::<Float32Type>(),
3067 DataType::Float64 => rand::<Float64Type>(),
3068 DataType::Decimal128(_, _) => rand_primitive::<Decimal128Type>(data_type.clone()),
3069 DataType::Decimal256(_, _) => rand_primitive::<Decimal256Type>(data_type.clone()),
3070 DataType::Utf8 => rand_utf8(ByteCount::from(12), false),
3071 DataType::LargeUtf8 => rand_utf8(ByteCount::from(12), true),
3072 DataType::Binary => rand_fixedbin(ByteCount::from(12), false),
3073 DataType::LargeBinary => rand_fixedbin(ByteCount::from(12), true),
3074 DataType::Dictionary(key_type, value_type) => {
3075 dict_type(rand_type(value_type), key_type)
3076 }
3077 DataType::FixedSizeList(child, dimension) => cycle_vec(
3078 rand_type(child.data_type()),
3079 Dimension::from(*dimension as u32),
3080 ),
3081 DataType::FixedSizeBinary(size) => rand_fsb(*size),
3082 DataType::List(child) => rand_list(child.data_type(), false),
3083 DataType::LargeList(child) => rand_list(child.data_type(), true),
3084 DataType::Map(entries_field, _) => {
3085 let DataType::Struct(fields) = entries_field.data_type() else {
3086 panic!("Map entries field must be a struct");
3087 };
3088 let key_type = fields[0].data_type();
3089 let value_type = fields[1].data_type();
3090 rand_map(key_type, value_type)
3091 }
3092 DataType::Duration(unit) => match unit {
3093 TimeUnit::Second => rand::<DurationSecondType>(),
3094 TimeUnit::Millisecond => rand::<DurationMillisecondType>(),
3095 TimeUnit::Microsecond => rand::<DurationMicrosecondType>(),
3096 TimeUnit::Nanosecond => rand::<DurationNanosecondType>(),
3097 },
3098 DataType::Interval(unit) => rand_interval(*unit),
3099 DataType::Date32 => rand_date32(),
3100 DataType::Date64 => rand_date64(),
3101 DataType::Time32(resolution) => rand_time32(resolution),
3102 DataType::Time64(resolution) => rand_time64(resolution),
3103 DataType::Timestamp(_, _) => rand_timestamp(data_type),
3104 DataType::Struct(fields) => rand_struct(fields.clone()),
3105 DataType::Null => null_type(),
3106 _ => unimplemented!("random generation of {}", data_type),
3107 }
3108 }
3109
3110 pub fn dict<K: ArrowDictionaryKeyType + Send + Sync>(
3116 generator: Box<dyn ArrayGenerator>,
3117 ) -> Box<dyn ArrayGenerator> {
3118 Box::new(DictionaryGenerator::<K>::new(generator))
3119 }
3120
3121 pub fn dict_type(
3123 generator: Box<dyn ArrayGenerator>,
3124 key_type: &DataType,
3125 ) -> Box<dyn ArrayGenerator> {
3126 match key_type {
3127 DataType::Int8 => dict::<Int8Type>(generator),
3128 DataType::Int16 => dict::<Int16Type>(generator),
3129 DataType::Int32 => dict::<Int32Type>(generator),
3130 DataType::Int64 => dict::<Int64Type>(generator),
3131 DataType::UInt8 => dict::<UInt8Type>(generator),
3132 DataType::UInt16 => dict::<UInt16Type>(generator),
3133 DataType::UInt32 => dict::<UInt32Type>(generator),
3134 DataType::UInt64 => dict::<UInt64Type>(generator),
3135 _ => unimplemented!(),
3136 }
3137 }
3138
3139 pub fn low_cardinality(
3144 generator: Box<dyn ArrayGenerator>,
3145 cardinality: usize,
3146 ) -> Box<dyn ArrayGenerator> {
3147 Box::new(LowCardinalityGenerator::new(generator, cardinality))
3148 }
3149}
3150
3151pub fn gen_batch() -> BatchGeneratorBuilder {
3153 BatchGeneratorBuilder::default()
3154}
3155
3156pub fn gen_array(genn: Box<dyn ArrayGenerator>) -> ArrayGeneratorBuilder {
3158 ArrayGeneratorBuilder::new(genn)
3159}
3160
/// Field-metadata key read by `rand_field` to choose a content-specific generator.
///
/// The only value handled there is `"sentence"`, which makes `Utf8` / `LargeUtf8` fields use
/// `array::random_sentence(1, 10, ..)`; any other value falls back to `array::rand_type`.
pub const CONTENT_TYPE_KEY: &str = "lance-datagen:content-type";
3164
/// Field-metadata key read by `rand_field` to request a low-cardinality column.
///
/// The value is parsed as a `usize`; when it parses and is greater than zero the field's
/// generator is wrapped with `array::low_cardinality`. Unparsable or zero values are ignored.
pub const CARDINALITY_KEY: &str = "lance-datagen:cardinality";
3168
3169pub fn rand_field(field: &Field) -> Box<dyn ArrayGenerator> {
3178 let mut generator = if let Some(content_type) = field.metadata().get(CONTENT_TYPE_KEY) {
3179 match (content_type.as_str(), field.data_type()) {
3180 ("sentence", DataType::Utf8) => array::random_sentence(1, 10, false),
3181 ("sentence", DataType::LargeUtf8) => array::random_sentence(1, 10, true),
3182 _ => array::rand_type(field.data_type()),
3183 }
3184 } else {
3185 array::rand_type(field.data_type())
3186 };
3187
3188 if let Some(cardinality_str) = field.metadata().get(CARDINALITY_KEY)
3189 && let Ok(cardinality) = cardinality_str.parse::<usize>()
3190 && cardinality > 0
3191 {
3192 generator = array::low_cardinality(generator, cardinality);
3193 }
3194
3195 generator
3196}
3197
3198pub fn rand(schema: &Schema) -> BatchGeneratorBuilder {
3207 let mut builder = BatchGeneratorBuilder::default();
3208 for field in schema.fields() {
3209 builder = builder.col(field.name(), rand_field(field));
3210 }
3211 builder
3212}
3213
3214#[cfg(test)]
3215mod tests {
3216
3217 use arrow::datatypes::{Float32Type, Int8Type, Int16Type, UInt32Type};
3218 use arrow_array::{BooleanArray, Float32Array, Int8Array, Int16Array, Int32Array, UInt32Array};
3219
3220 use super::*;
3221
3222 #[test]
3223 fn test_step() {
3224 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3225 let mut genn = array::step::<Int32Type>();
3226 assert_eq!(
3227 *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3228 Int32Array::from_iter([0, 1, 2, 3, 4])
3229 );
3230 assert_eq!(
3231 *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3232 Int32Array::from_iter([5, 6, 7, 8, 9])
3233 );
3234
3235 let mut genn = array::step::<Int8Type>();
3236 assert_eq!(
3237 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3238 Int8Array::from_iter([0, 1, 2])
3239 );
3240
3241 let mut genn = array::step::<Float32Type>();
3242 assert_eq!(
3243 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3244 Float32Array::from_iter([0.0, 1.0, 2.0])
3245 );
3246
3247 let mut genn = array::step_custom::<Int16Type>(4, 8);
3248 assert_eq!(
3249 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3250 Int16Array::from_iter([4, 12, 20])
3251 );
3252 assert_eq!(
3253 *genn.generate(RowCount::from(2), &mut rng).unwrap(),
3254 Int16Array::from_iter([28, 36])
3255 );
3256 }
3257
3258 #[test]
3259 fn test_cycle() {
3260 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3261 let mut genn = array::cycle::<Int32Type>(vec![1, 2, 3]);
3262 assert_eq!(
3263 *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3264 Int32Array::from_iter([1, 2, 3, 1, 2])
3265 );
3266
3267 let mut genn = array::cycle_utf8_literals(&["abc", "def", "xyz"]);
3268 assert_eq!(
3269 *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3270 StringArray::from_iter_values(["abc", "def", "xyz", "abc", "def"])
3271 );
3272 assert_eq!(
3273 *genn.generate(RowCount::from(1), &mut rng).unwrap(),
3274 StringArray::from_iter_values(["xyz"])
3275 );
3276
3277 let mut genn = array::cycle_bool(vec![false, false, true]);
3278 assert_eq!(
3279 *genn.generate(RowCount::from(5), &mut rng).unwrap(),
3280 BooleanArray::from_iter(vec![false, false, true, false, false].into_iter().map(Some))
3281 );
3282 assert_eq!(
3283 *genn.generate(RowCount::from(1), &mut rng).unwrap(),
3284 BooleanArray::from_iter(vec![Some(true)])
3285 )
3286 }
3287
3288 #[test]
3289 fn test_fill() {
3290 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3291 let mut genn = array::fill::<Int32Type>(42);
3292 assert_eq!(
3293 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3294 Int32Array::from_iter([42, 42, 42])
3295 );
3296 assert_eq!(
3297 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3298 Int32Array::from_iter([42, 42, 42])
3299 );
3300
3301 let mut genn = array::fill_varbin(vec![0, 1, 2]);
3302 assert_eq!(
3303 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3304 arrow_array::BinaryArray::from_iter_values([
3305 "\x00\x01\x02",
3306 "\x00\x01\x02",
3307 "\x00\x01\x02"
3308 ])
3309 );
3310
3311 let mut genn = array::fill_utf8("xyz".to_string());
3312 assert_eq!(
3313 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3314 arrow_array::StringArray::from_iter_values(["xyz", "xyz", "xyz"])
3315 );
3316 }
3317
3318 #[test]
3319 fn test_utf8_prefix_plus_counter() {
3320 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3321 let mut genn = array::utf8_prefix_plus_counter("user_", false);
3322 assert_eq!(
3323 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3324 arrow_array::StringArray::from_iter_values(["user_0", "user_1", "user_2"])
3325 );
3326
3327 let mut genn = array::utf8_prefix_plus_counter("user_", true);
3328 assert_eq!(
3329 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3330 arrow_array::LargeStringArray::from_iter_values(["user_0", "user_1", "user_2"])
3331 );
3332 }
3333
3334 #[test]
3335 fn test_rng() {
3336 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3338 let mut genn = array::rand::<Int32Type>();
3339 assert_eq!(
3340 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3341 Int32Array::from_iter([-797553329, 1369325940, -69174021])
3342 );
3343
3344 let mut genn = array::rand_fixedbin(ByteCount::from(3), false);
3345 assert_eq!(
3346 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3347 arrow_array::BinaryArray::from_iter_values([
3348 [184, 53, 216],
3349 [12, 96, 159],
3350 [125, 179, 56]
3351 ])
3352 );
3353
3354 let mut genn = array::rand_utf8(ByteCount::from(3), false);
3355 assert_eq!(
3356 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3357 arrow_array::StringArray::from_iter_values([">@p", "n `", "NWa"])
3358 );
3359
3360 let mut genn = array::random_sentence(1, 5, false);
3361 let words = genn.generate(RowCount::from(10), &mut rng).unwrap();
3362 assert_eq!(words.data_type(), &DataType::Utf8);
3363 let words_array = words.as_any().downcast_ref::<StringArray>().unwrap();
3364 for i in 0..10 {
3366 let sentence = words_array.value(i);
3367 let word_count = sentence.split_whitespace().count();
3368 assert!((1..=5).contains(&word_count));
3369 }
3370
3371 let mut genn = array::rand_date32();
3372 let days_32 = genn.generate(RowCount::from(3), &mut rng).unwrap();
3373 assert_eq!(days_32.data_type(), &DataType::Date32);
3374
3375 let mut genn = array::rand_date64();
3376 let days_64 = genn.generate(RowCount::from(3), &mut rng).unwrap();
3377 assert_eq!(days_64.data_type(), &DataType::Date64);
3378
3379 let mut genn = array::rand_boolean();
3380 let bools = genn.generate(RowCount::from(1024), &mut rng).unwrap();
3381 assert_eq!(bools.data_type(), &DataType::Boolean);
3382 let bools = bools.as_any().downcast_ref::<BooleanArray>().unwrap();
3383 assert!(bools.false_count() > 100);
3385 assert!(bools.true_count() > 100);
3386
3387 let mut genn = array::rand_varbin(ByteCount::from(2), ByteCount::from(4));
3388 assert_eq!(
3389 *genn.generate(RowCount::from(3), &mut rng).unwrap(),
3390 arrow_array::BinaryArray::from_iter_values([
3391 vec![111, 9, 80],
3392 vec![86, 118, 13, 209],
3393 vec![68, 33, 202]
3394 ])
3395 );
3396 }
3397
3398 #[test]
3399 fn test_rng_list() {
3400 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3402 let mut genn = array::rand_list(&DataType::Int32, false);
3403 let arr = genn.generate(RowCount::from(100), &mut rng).unwrap();
3404 let arr = arr.as_list::<i32>();
3406 assert!(arr.iter().any(|l| l.unwrap().is_empty()));
3407 assert!(arr.iter().any(|l| l.unwrap().len() < 11));
3409 }
3410
3411 #[test]
3412 fn test_rng_distribution() {
3413 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3417 let mut genn = array::rand::<UInt32Type>();
3418 for _ in 0..10 {
3419 let arr = genn.generate(RowCount::from(10000), &mut rng).unwrap();
3420 let int_arr = arr.as_any().downcast_ref::<UInt32Array>().unwrap();
3421 let mut buckets = vec![0_u32; 256];
3422 for val in int_arr.values() {
3423 buckets[(*val >> 24) as usize] += 1;
3424 }
3425 for bucket in buckets {
3426 assert!(bucket > 15);
3429 }
3430 }
3431 }
3432
3433 #[test]
3434 fn test_nulls() {
3435 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3436 let mut genn = array::rand::<Int32Type>().with_random_nulls(0.3);
3437
3438 let arr = genn.generate(RowCount::from(1000), &mut rng).unwrap();
3439
3440 assert_eq!(arr.null_count(), 297);
3442
3443 for len in 0..100 {
3444 let arr = genn.generate(RowCount::from(len), &mut rng).unwrap();
3445 assert_eq!(
3447 arr.null_count(),
3448 arr.nulls()
3449 .map(|nulls| (len as usize)
3450 - nulls.buffer().count_set_bits_offset(0, len as usize))
3451 .unwrap_or(0)
3452 );
3453 }
3454
3455 let mut genn = array::rand::<Int32Type>().with_random_nulls(0.0);
3456 let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();
3457
3458 assert_eq!(arr.null_count(), 0);
3459
3460 let mut genn = array::rand::<Int32Type>().with_random_nulls(1.0);
3461 let arr = genn.generate(RowCount::from(10), &mut rng).unwrap();
3462
3463 assert_eq!(arr.null_count(), 10);
3464 assert!((0..10).all(|idx| arr.is_null(idx)));
3465
3466 let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, true]);
3467 let arr = genn.generate(RowCount::from(7), &mut rng).unwrap();
3468 assert!((0..2).all(|idx| arr.is_valid(idx)));
3469 assert!(arr.is_null(2));
3470 assert!((3..5).all(|idx| arr.is_valid(idx)));
3471 assert!(arr.is_null(5));
3472 assert!(arr.is_valid(6));
3473 }
3474
3475 #[test]
3476 fn test_unit_circle() {
3477 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3478 let mut genn = array::cycle_unit_circle(4);
3479 let arr = genn.generate(RowCount::from(6), &mut rng).unwrap();
3480
3481 let arr_values = arr
3482 .as_fixed_size_list()
3483 .values()
3484 .as_primitive::<Float32Type>()
3485 .values()
3486 .to_vec();
3487 assert_eq!(arr_values.len(), 12);
3488 let expected_values = [1.0, 0.0, 0.0, 1.0, -1.0, 0.0, 0.0, -1.0, 1.0, 0.0, 0.0, 1.0];
3489 for (actual, expected) in arr_values.iter().zip(expected_values.iter()) {
3490 assert!((actual - expected).abs() < 0.0001);
3491 }
3492 }
3493
3494 #[test]
3495 fn test_jitter_centroids() {
3496 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
3497 let mut centroids_gen = array::cycle_unit_circle(4);
3498 let centroids = centroids_gen.generate(RowCount::from(4), &mut rng).unwrap();
3499
3500 let centroid_values = centroids
3501 .as_fixed_size_list()
3502 .values()
3503 .as_primitive::<Float32Type>()
3504 .values()
3505 .to_vec();
3506
3507 let mut jitter_jen = array::jitter_centroids(centroids, 0.001);
3508 let jittered = jitter_jen.generate(RowCount::from(100), &mut rng).unwrap();
3509
3510 let values = jittered
3511 .as_fixed_size_list()
3512 .values()
3513 .as_primitive::<Float32Type>()
3514 .values()
3515 .to_vec();
3516
3517 for i in 0..100 {
3518 let centroid = i % 4;
3519 let centroid_x = centroid_values[centroid * 2];
3520 let centroid_y = centroid_values[centroid * 2 + 1];
3521 let value_x = values[i * 2];
3522 let value_y = values[i * 2 + 1];
3523
3524 let l2_dist = ((value_x - centroid_x).powi(2) + (value_y - centroid_y).powi(2)).sqrt();
3525 assert!(l2_dist < 0.001001);
3526 assert!(l2_dist > 0.000999);
3527 }
3528 }
3529
3530 #[test]
3531 fn test_rand_schema() {
3532 let schema = Schema::new(vec![
3533 Field::new("a", DataType::Int32, true),
3534 Field::new("b", DataType::Utf8, true),
3535 Field::new("c", DataType::Float32, true),
3536 Field::new("d", DataType::Int32, true),
3537 Field::new("e", DataType::Int32, true),
3538 ]);
3539 let rbr = rand(&schema)
3540 .into_reader_bytes(
3541 ByteCount::from(1024 * 1024),
3542 BatchCount::from(8),
3543 RoundingBehavior::ExactOrErr,
3544 )
3545 .unwrap();
3546 assert_eq!(*rbr.schema(), schema);
3547
3548 let batches = rbr.map(|val| val.unwrap()).collect::<Vec<_>>();
3549 assert_eq!(batches.len(), 8);
3550
3551 for batch in batches {
3552 assert_eq!(batch.num_rows(), 1024 * 1024 / 32);
3553 assert_eq!(batch.num_columns(), 5);
3554 }
3555 }
3556}