sample_arrow2/
chunk.rs

1//! Chained samplers for generating arbitrary `Chunk<Box<dyn Array>>` arrow chunks.
2
3use std::ops::Range;
4
5use arrow2::{array::Array, chunk::Chunk, datatypes::DataType};
6use sample_std::{sample_all, Chained, Sample, VecSampler};
7
8use crate::{array::ArbitraryArray, datatypes::DataTypeSampler};
9
10pub type ChainedChunk = Chained<(Vec<DataType>, usize), Chunk<Box<dyn Array>>>;
11pub type ChunkSampler = Box<dyn Sample<Output = ChainedChunk> + Send + Sync>;
12
13pub type ChainedMultiChunk = Chained<(Vec<DataType>, Vec<usize>), Vec<Chunk<Box<dyn Array>>>>;
14pub type MultiChunkSampler = Box<dyn Sample<Output = ChainedMultiChunk> + Send + Sync>;
15
16pub struct ArbitraryChunk<N, V> {
17    pub chunk_len: Range<usize>,
18    pub array_count: Range<usize>,
19    pub data_type: DataTypeSampler,
20    pub array: ArbitraryArray<N, V>,
21}
22
23impl<N, V> ArbitraryChunk<N, V>
24where
25    N: Sample<Output = String> + Send + Sync + Clone + 'static,
26    V: Sample<Output = bool> + Send + Sync + Clone + 'static,
27{
28    pub fn sample_one(self) -> ChunkSampler {
29        Box::new(
30            VecSampler {
31                length: self.array_count,
32                el: self.data_type,
33            }
34            .zip(self.chunk_len)
35            .chain_resample(move |seed| Self::from_seed(&self.array, seed), 100),
36        )
37    }
38
39    pub fn sample_many(self, chunk_count: Range<usize>) -> MultiChunkSampler {
40        Box::new(
41            VecSampler {
42                length: self.array_count,
43                el: self.data_type,
44            }
45            .zip(VecSampler {
46                length: chunk_count,
47                el: self.chunk_len,
48            })
49            .chain_resample(
50                move |(dts, lens)| {
51                    sample_all(
52                        lens.into_iter()
53                            .map(|len| Self::from_seed(&self.array, (dts.clone(), len)))
54                            .collect(),
55                    )
56                },
57                100,
58            ),
59        )
60    }
61
62    pub fn from_seed(
63        array: &ArbitraryArray<N, V>,
64        seed: (Vec<DataType>, usize),
65    ) -> Box<dyn Sample<Output = Chunk<Box<dyn Array>>> + Send + Sync> {
66        let (dts, len) = seed;
67        Box::new(
68            sample_all(
69                dts.into_iter()
70                    .map(|data_type| array.with_len(len).sampler_from_data_type(&data_type))
71                    .collect(),
72            )
73            .try_convert(Chunk::new, |chunk| Some(chunk.to_vec())),
74        )
75    }
76}