lance_encoding/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{
5    fmt::{self},
6    hash::{Hash, RandomState},
7    sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16    AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17    NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// The kinds of statistics that can be recorded for a data block.
///
/// The `Debug` representation of each variant is exactly its name (for example
/// `"BitWidth"`); it is derived rather than hand-written.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values. Fixed-width blocks record one
    /// entry per chunk of up to 1024 values.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Estimated number of distinct values (a HyperLogLog estimate, not an exact count).
    Cardinality,
    /// Fixed-size statistic. No block in this file records it.
    FixedSize,
    /// Number of null values. All-null blocks report their number of values.
    NullCount,
    /// Maximum length of a single value.
    MaxLength,
}
42
43impl fmt::Display for Stat {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        write!(f, "{:?}", self)
46    }
47}
48
/// Computes the statistics of a data block.
///
/// The implementations in this file store the results in the block's `block_info`
/// map, from which they are later read through [`GetStat`].
pub trait ComputeStat {
    /// Computes this block's statistics and records them on the block.
    fn compute_stat(&mut self);
}
52
53impl ComputeStat for DataBlock {
54    fn compute_stat(&mut self) {
55        match self {
56            Self::Empty() => {}
57            Self::Constant(_) => {}
58            Self::AllNull(_) => {}
59            Self::Nullable(data_block) => data_block.data.compute_stat(),
60            Self::FixedWidth(data_block) => data_block.compute_stat(),
61            Self::FixedSizeList(data_block) => data_block.compute_stat(),
62            Self::VariableWidth(data_block) => data_block.compute_stat(),
63            Self::Opaque(data_block) => data_block.compute_stat(),
64            Self::Struct(data_block) => data_block.compute_stat(),
65            Self::Dictionary(_) => {}
66        }
67    }
68}
69
70impl ComputeStat for VariableWidthBlock {
71    fn compute_stat(&mut self) {
72        if !self.block_info.0.read().unwrap().is_empty() {
73            panic!("compute_stat should only be called once during DataBlock construction");
74        }
75        let data_size = self.data_size();
76        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
77
78        let cardinality_array = self.cardinality();
79
80        let max_length_array = self.max_length();
81
82        let mut info = self.block_info.0.write().unwrap();
83        info.insert(Stat::DataSize, data_size_array);
84        info.insert(Stat::Cardinality, cardinality_array);
85        info.insert(Stat::MaxLength, max_length_array);
86    }
87}
88
89impl ComputeStat for FixedWidthDataBlock {
90    fn compute_stat(&mut self) {
91        // compute this datablock's data_size
92        let data_size = self.data_size();
93        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
94
95        // compute this datablock's max_bit_width
96        let max_bit_widths = self.max_bit_widths();
97
98        // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8
99        let max_len = self.bits_per_value / 8;
100        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
101
102        let cardidinality_array = if self.bits_per_value == 128 {
103            Some(self.cardinality())
104        } else {
105            None
106        };
107
108        let mut info = self.block_info.0.write().unwrap();
109        info.insert(Stat::DataSize, data_size_array);
110        info.insert(Stat::BitWidth, max_bit_widths);
111        info.insert(Stat::MaxLength, max_len_array);
112        if let Some(cardinality_array) = cardidinality_array {
113            info.insert(Stat::Cardinality, cardinality_array);
114        }
115    }
116}
117
impl ComputeStat for FixedSizeListBlock {
    /// Computes the statistics of the child block; nothing is recorded on the list
    /// block itself.
    fn compute_stat(&mut self) {
        // We leave the child stats unchanged.  This may seem odd (e.g. should bit width be the
        // bit width of the child * dimension?) but it's because we use these stats to determine
        // compression and we are currently just compressing the child data.
        //
        // There is a potential opportunity here to do better.  For example, if we have a FSL of
        // 4 32-bit integers then we should probably treat them as a single 128-bit integer or maybe
        // even 4 columns of 32-bit integers.  This might yield better compression.
        self.child.compute_stat();
    }
}
130
131impl ComputeStat for OpaqueBlock {
132    fn compute_stat(&mut self) {
133        // compute this datablock's data_size
134        let data_size = self.data_size();
135        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
136        let mut info = self.block_info.0.write().unwrap();
137        info.insert(Stat::DataSize, data_size_array);
138    }
139}
140
141pub trait GetStat: fmt::Debug {
142    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
143
144    fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
145        self.get_stat(stat)
146            .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
147    }
148
149    fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
150        let stat_value = self.expect_stat(stat);
151        let stat_value = stat_value.as_primitive::<T>();
152        if stat_value.len() != 1 {
153            panic!(
154                "{:?} DataBlock does not have exactly one value for `{} statistics.",
155                self, stat
156            );
157        }
158        stat_value.value(0)
159    }
160}
161
162impl GetStat for DataBlock {
163    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
164        match self {
165            Self::Empty() => None,
166            Self::Constant(_) => None,
167            Self::AllNull(data_block) => data_block.get_stat(stat),
168            Self::Nullable(data_block) => data_block.get_stat(stat),
169            Self::FixedWidth(data_block) => data_block.get_stat(stat),
170            Self::FixedSizeList(data_block) => data_block.get_stat(stat),
171            Self::VariableWidth(data_block) => data_block.get_stat(stat),
172            Self::Opaque(data_block) => data_block.get_stat(stat),
173            Self::Struct(data_block) => data_block.get_stat(stat),
174            Self::Dictionary(data_block) => data_block.get_stat(stat),
175        }
176    }
177}
178
// NullableDataBlock will be deprecated in Lance 2.1.
impl GetStat for NullableDataBlock {
    /// Returns the statistic of the inner `DataBlock` wrapped by this `NullableDataBlock`.
    ///
    /// This is not accurate for the nullable block as a whole, but `NullableDataBlock` is
    /// going to be deprecated in Lance 2.1 anyway.
    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
        self.data.get_stat(stat)
    }
}
187
188impl GetStat for VariableWidthBlock {
189    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
190        let block_info = self.block_info.0.read().unwrap();
191
192        if block_info.is_empty() {
193            panic!("get_stat should be called after statistics are computed.");
194        }
195        block_info.get(&stat).cloned()
196    }
197}
198
199impl GetStat for FixedSizeListBlock {
200    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
201        let child_stat = self.child.get_stat(stat);
202        match stat {
203            Stat::MaxLength => child_stat.map(|max_length| {
204                // this is conservative when working with variable length data as we shouldn't assume
205                // that we have a list of all max-length elements but it's cheap and easy to calculate
206                let max_length = max_length.as_primitive::<UInt64Type>().value(0);
207                Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
208            }),
209            _ => child_stat,
210        }
211    }
212}
213
214impl VariableWidthBlock {
215    // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data
216    // without any adjustment(for example, no null_adjustment for offsets)
217    fn cardinality(&mut self) -> Arc<dyn Array> {
218        const PRECISION: u8 = 4;
219        // The default hasher (currently sip hash 1-3) does not seem to give good results
220        // with HLL.
221        //
222        // In particular, when using randomly generated 12-byte strings, the HLL count was
223        // suggested a cardinality of 500 (out of 1000 unique items and hashes) at least 10%
224        // of the time.
225        //
226        // Using xxhash3 consistently gives better results.
227        let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
228            HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
229
230        match self.bits_per_offset {
231            32 => {
232                let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
233                let offsets: &[u32] = offsets_ref.as_ref();
234
235                offsets
236                    .iter()
237                    .zip(offsets.iter().skip(1))
238                    .for_each(|(&start, &end)| {
239                        hll.insert(&self.data[start as usize..end as usize]);
240                    });
241                let cardinality = hll.count() as u64;
242                Arc::new(UInt64Array::from(vec![cardinality]))
243            }
244            64 => {
245                let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
246                let offsets: &[u64] = offsets_ref.as_ref();
247
248                offsets
249                    .iter()
250                    .zip(offsets.iter().skip(1))
251                    .for_each(|(&start, &end)| {
252                        hll.insert(&self.data[start as usize..end as usize]);
253                    });
254
255                let cardinality = hll.count() as u64;
256                Arc::new(UInt64Array::from(vec![cardinality]))
257            }
258            _ => {
259                unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
260            }
261        }
262    }
263
264    fn max_length(&mut self) -> Arc<dyn Array> {
265        match self.bits_per_offset {
266            32 => {
267                let offsets = self.offsets.borrow_to_typed_slice::<u32>();
268                let offsets = offsets.as_ref();
269                let max_len = offsets
270                    .windows(2)
271                    .map(|pair| pair[1] - pair[0])
272                    .max()
273                    .unwrap_or(0);
274                Arc::new(UInt64Array::from(vec![max_len as u64]))
275            }
276            64 => {
277                let offsets = self.offsets.borrow_to_typed_slice::<u64>();
278                let offsets = offsets.as_ref();
279                let max_len = offsets
280                    .windows(2)
281                    .map(|pair| pair[1] - pair[0])
282                    .max()
283                    .unwrap_or(0);
284                Arc::new(UInt64Array::from(vec![max_len]))
285            }
286            _ => {
287                unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
288            }
289        }
290    }
291}
292
293impl GetStat for AllNullDataBlock {
294    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
295        match stat {
296            Stat::NullCount => {
297                let null_count = self.num_values;
298                Some(Arc::new(UInt64Array::from(vec![null_count])))
299            }
300            Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
301            _ => None,
302        }
303    }
304}
305
306impl GetStat for FixedWidthDataBlock {
307    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
308        let block_info = self.block_info.0.read().unwrap();
309
310        if block_info.is_empty() {
311            panic!("get_stat should be called after statistics are computed.");
312        }
313        block_info.get(&stat).cloned()
314    }
315}
316
317impl FixedWidthDataBlock {
318    fn max_bit_widths(&mut self) -> Arc<dyn Array> {
319        assert!(self.num_values > 0);
320
321        const CHUNK_SIZE: usize = 1024;
322
323        fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
324            slice
325                .chunks(CHUNK_SIZE)
326                .map(|chunk| {
327                    let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
328                    bits_per_value - max_value.leading_zeros() as u64
329                })
330                .collect()
331        }
332
333        match self.bits_per_value {
334            8 => {
335                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
336                let u8_slice = u8_slice.as_ref();
337                Arc::new(UInt64Array::from(calculate_max_bit_width(
338                    u8_slice,
339                    self.bits_per_value,
340                )))
341            }
342            16 => {
343                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
344                let u16_slice = u16_slice.as_ref();
345                Arc::new(UInt64Array::from(calculate_max_bit_width(
346                    u16_slice,
347                    self.bits_per_value,
348                )))
349            }
350            32 => {
351                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
352                let u32_slice = u32_slice.as_ref();
353                Arc::new(UInt64Array::from(calculate_max_bit_width(
354                    u32_slice,
355                    self.bits_per_value,
356                )))
357            }
358            64 => {
359                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
360                let u64_slice = u64_slice.as_ref();
361                Arc::new(UInt64Array::from(calculate_max_bit_width(
362                    u64_slice,
363                    self.bits_per_value,
364                )))
365            }
366            _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
367        }
368    }
369
370    fn cardinality(&mut self) -> Arc<dyn Array> {
371        match self.bits_per_value {
372            128 => {
373                let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
374                let u128_slice = u128_slice_ref.as_ref();
375
376                const PRECISION: u8 = 4;
377                let mut hll: HyperLogLogPlus<u128, RandomState> =
378                    HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
379                for val in u128_slice {
380                    hll.insert(val);
381                }
382                let cardinality = hll.count() as u64;
383                Arc::new(UInt64Array::from(vec![cardinality]))
384            }
385            _ => unreachable!(),
386        }
387    }
388}
389
390impl GetStat for OpaqueBlock {
391    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
392        let block_info = self.block_info.0.read().unwrap();
393
394        if block_info.is_empty() {
395            panic!("get_stat should be called after statistics are computed.");
396        }
397        block_info.get(&stat).cloned()
398    }
399}
400
impl GetStat for DictionaryDataBlock {
    /// Dictionary blocks expose no statistics: every lookup returns `None`.
    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
        None
    }
}
406
407impl GetStat for StructDataBlock {
408    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
409        let block_info = self.block_info.0.read().unwrap();
410        if block_info.is_empty() {
411            panic!("get_stat should be called after statistics are computed.")
412        }
413        block_info.get(&stat).cloned()
414    }
415}
416
417impl ComputeStat for StructDataBlock {
418    fn compute_stat(&mut self) {
419        let data_size = self.data_size();
420        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
421
422        let max_len = self
423            .children
424            .iter()
425            .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
426            .sum::<u64>();
427        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
428
429        let mut info = self.block_info.0.write().unwrap();
430        info.insert(Stat::DataSize, data_size_array);
431        info.insert(Stat::MaxLength, max_len_array);
432    }
433}
434
435#[cfg(test)]
436mod tests {
437    use std::sync::Arc;
438
439    use arrow_array::{
440        ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
441        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
442    };
443    use arrow_schema::{DataType, Field};
444    use lance_arrow::DataTypeExt;
445    use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
446    use rand::SeedableRng;
447
448    use crate::statistics::{GetStat, Stat};
449
450    use super::DataBlock;
451
452    use arrow::{
453        array::AsArray,
454        compute::concat,
455        datatypes::{Int32Type, UInt64Type},
456    };
457    use arrow_array::Array;
458    #[test]
459    fn test_data_size_stat() {
460        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
461        let mut gen = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
462        let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap();
463        let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap();
464        let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap();
465        let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
466
467        let concatenated_array = concat(&[
468            &*Arc::new(arr1.clone()) as &dyn Array,
469            &*Arc::new(arr2.clone()) as &dyn Array,
470            &*Arc::new(arr3.clone()) as &dyn Array,
471        ])
472        .unwrap();
473
474        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
475
476        let total_buffer_size: usize = concatenated_array
477            .to_data()
478            .buffers()
479            .iter()
480            .map(|buffer| buffer.len())
481            .sum();
482        assert!(data_size == total_buffer_size as u64);
483
484        // test DataType::Binary
485        let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
486        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
487        let block = DataBlock::from_array(arr.clone());
488        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
489
490        let total_buffer_size: usize = arr
491            .to_data()
492            .buffers()
493            .iter()
494            .map(|buffer| buffer.len())
495            .sum();
496        assert!(data_size == total_buffer_size as u64);
497
498        // test DataType::Struct
499        let fields = vec![
500            Arc::new(Field::new("int_field", DataType::Int32, false)),
501            Arc::new(Field::new("float_field", DataType::Float32, false)),
502        ]
503        .into();
504
505        let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields));
506        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
507        let block = DataBlock::from_array(arr.clone());
508        let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
509        let total_buffer_size: usize = arr_parts
510            .iter()
511            .map(|arr| {
512                arr.to_data()
513                    .buffers()
514                    .iter()
515                    .map(|buffer| buffer.len())
516                    .sum::<usize>()
517            })
518            .sum();
519        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
520        assert!(data_size == total_buffer_size as u64);
521
522        // test DataType::Dictionary
523        let mut gen = array::rand_type(&DataType::Dictionary(
524            Box::new(DataType::Int32),
525            Box::new(DataType::Utf8),
526        ));
527        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
528        let block = DataBlock::from_array(arr.clone());
529        assert!(block.get_stat(Stat::DataSize).is_none());
530
531        let mut gen = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
532        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
533        let block = DataBlock::from_array(arr.clone());
534        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
535        let total_buffer_size: usize = arr
536            .to_data()
537            .buffers()
538            .iter()
539            .map(|buffer| buffer.len())
540            .sum();
541
542        assert!(data_size == total_buffer_size as u64);
543    }
544
545    #[test]
546    fn test_bit_width_stat_for_integers() {
547        let int8_array = Int8Array::from(vec![1, 2, 3]);
548        let array_ref: ArrayRef = Arc::new(int8_array);
549        let block = DataBlock::from_array(array_ref);
550
551        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
552        let actual_bit_width = block.expect_stat(Stat::BitWidth);
553
554        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
555
556        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
557        let array_ref: ArrayRef = Arc::new(int8_array);
558        let block = DataBlock::from_array(array_ref);
559
560        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
561        let actual_bit_width = block.expect_stat(Stat::BitWidth);
562        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
563
564        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
565        let array_ref: ArrayRef = Arc::new(int8_array);
566        let block = DataBlock::from_array(array_ref);
567
568        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
569        let actual_bit_width = block.expect_stat(Stat::BitWidth);
570        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
571
572        let int8_array = Int8Array::from(vec![-1, 2, 3]);
573        let array_ref: ArrayRef = Arc::new(int8_array);
574        let block = DataBlock::from_array(array_ref);
575
576        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
577        let actual_bit_width = block.expect_stat(Stat::BitWidth);
578        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
579
580        let int16_array = Int16Array::from(vec![1, 2, 3]);
581        let array_ref: ArrayRef = Arc::new(int16_array);
582        let block = DataBlock::from_array(array_ref);
583
584        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
585        let actual_bit_width = block.expect_stat(Stat::BitWidth);
586        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
587
588        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
589        let array_ref: ArrayRef = Arc::new(int16_array);
590        let block = DataBlock::from_array(array_ref);
591
592        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
593        let actual_bit_width = block.expect_stat(Stat::BitWidth);
594        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
595
596        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
597        let array_ref: ArrayRef = Arc::new(int16_array);
598        let block = DataBlock::from_array(array_ref);
599
600        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
601        let actual_bit_width = block.expect_stat(Stat::BitWidth);
602        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
603
604        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
605        let array_ref: ArrayRef = Arc::new(int16_array);
606        let block = DataBlock::from_array(array_ref);
607
608        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
609        let actual_bit_width = block.expect_stat(Stat::BitWidth);
610        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
611
612        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
613        let array_ref: ArrayRef = Arc::new(int16_array);
614        let block = DataBlock::from_array(array_ref);
615
616        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
617        let actual_bit_width = block.expect_stat(Stat::BitWidth);
618        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
619
620        let int16_array = Int16Array::from(vec![-1, 2, 3]);
621        let array_ref: ArrayRef = Arc::new(int16_array);
622        let block = DataBlock::from_array(array_ref);
623
624        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
625        let actual_bit_width = block.expect_stat(Stat::BitWidth);
626        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
627
628        let int32_array = Int32Array::from(vec![1, 2, 3]);
629        let array_ref: ArrayRef = Arc::new(int32_array);
630        let block = DataBlock::from_array(array_ref);
631
632        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
633        let actual_bit_width = block.expect_stat(Stat::BitWidth);
634        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
635
636        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
637        let array_ref: ArrayRef = Arc::new(int32_array);
638        let block = DataBlock::from_array(array_ref);
639
640        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
641        let actual_bit_width = block.expect_stat(Stat::BitWidth);
642        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
643
644        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
645        let array_ref: ArrayRef = Arc::new(int32_array);
646        let block = DataBlock::from_array(array_ref);
647
648        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
649        let actual_bit_width = block.expect_stat(Stat::BitWidth);
650        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
651
652        let int32_array = Int32Array::from(vec![-1, 2, 3]);
653        let array_ref: ArrayRef = Arc::new(int32_array);
654        let block = DataBlock::from_array(array_ref);
655
656        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
657        let actual_bit_width = block.expect_stat(Stat::BitWidth);
658        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
659
660        let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
661        let array_ref: ArrayRef = Arc::new(int32_array);
662        let block = DataBlock::from_array(array_ref);
663
664        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
665        let actual_bit_width = block.expect_stat(Stat::BitWidth);
666        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
667
668        let int64_array = Int64Array::from(vec![1, 2, 3]);
669        let array_ref: ArrayRef = Arc::new(int64_array);
670        let block = DataBlock::from_array(array_ref);
671
672        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
673        let actual_bit_width = block.expect_stat(Stat::BitWidth);
674        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
675
676        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
677        let array_ref: ArrayRef = Arc::new(int64_array);
678        let block = DataBlock::from_array(array_ref);
679
680        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
681        let actual_bit_width = block.expect_stat(Stat::BitWidth);
682        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
683
684        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
685        let array_ref: ArrayRef = Arc::new(int64_array);
686        let block = DataBlock::from_array(array_ref);
687
688        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
689        let actual_bit_width = block.expect_stat(Stat::BitWidth);
690        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
691
692        let int64_array = Int64Array::from(vec![-1, 2, 3]);
693        let array_ref: ArrayRef = Arc::new(int64_array);
694        let block = DataBlock::from_array(array_ref);
695
696        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
697        let actual_bit_width = block.expect_stat(Stat::BitWidth);
698        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
699
700        let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
701        let array_ref: ArrayRef = Arc::new(int64_array);
702        let block = DataBlock::from_array(array_ref);
703
704        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
705        let actual_bit_width = block.expect_stat(Stat::BitWidth);
706        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
707
708        let uint8_array = UInt8Array::from(vec![1, 2, 3]);
709        let array_ref: ArrayRef = Arc::new(uint8_array);
710        let block = DataBlock::from_array(array_ref);
711
712        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
713        let actual_bit_width = block.expect_stat(Stat::BitWidth);
714        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
715
716        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
717        let array_ref: ArrayRef = Arc::new(uint8_array);
718        let block = DataBlock::from_array(array_ref);
719
720        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
721        let actual_bit_width = block.expect_stat(Stat::BitWidth);
722        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
723
724        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
725        let array_ref: ArrayRef = Arc::new(uint8_array);
726        let block = DataBlock::from_array(array_ref);
727
728        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
729        let actual_bit_width = block.expect_stat(Stat::BitWidth);
730        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
731
732        let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
733        let array_ref: ArrayRef = Arc::new(uint8_array);
734        let block = DataBlock::from_array(array_ref);
735
736        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
737        let actual_bit_width = block.expect_stat(Stat::BitWidth);
738        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
739
740        let uint16_array = UInt16Array::from(vec![1, 2, 3]);
741        let array_ref: ArrayRef = Arc::new(uint16_array);
742        let block = DataBlock::from_array(array_ref);
743
744        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
745        let actual_bit_width = block.expect_stat(Stat::BitWidth);
746        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
747
748        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
749        let array_ref: ArrayRef = Arc::new(uint16_array);
750        let block = DataBlock::from_array(array_ref);
751
752        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
753        let actual_bit_width = block.expect_stat(Stat::BitWidth);
754        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
755
756        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
757        let array_ref: ArrayRef = Arc::new(uint16_array);
758        let block = DataBlock::from_array(array_ref);
759
760        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
761        let actual_bit_width = block.expect_stat(Stat::BitWidth);
762        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
763
764        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
765        let array_ref: ArrayRef = Arc::new(uint16_array);
766        let block = DataBlock::from_array(array_ref);
767
768        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
769        let actual_bit_width = block.expect_stat(Stat::BitWidth);
770        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
771
772        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
773        let array_ref: ArrayRef = Arc::new(uint16_array);
774        let block = DataBlock::from_array(array_ref);
775
776        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
777        let actual_bit_width = block.expect_stat(Stat::BitWidth);
778        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
779
780        let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
781        let array_ref: ArrayRef = Arc::new(uint16_array);
782        let block = DataBlock::from_array(array_ref);
783
784        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
785        let actual_bit_width = block.expect_stat(Stat::BitWidth);
786        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
787
788        let uint32_array = UInt32Array::from(vec![1, 2, 3]);
789        let array_ref: ArrayRef = Arc::new(uint32_array);
790        let block = DataBlock::from_array(array_ref);
791
792        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
793        let actual_bit_width = block.expect_stat(Stat::BitWidth);
794        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
795
796        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
797        let array_ref: ArrayRef = Arc::new(uint32_array);
798        let block = DataBlock::from_array(array_ref);
799
800        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
801        let actual_bit_width = block.expect_stat(Stat::BitWidth);
802        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
803
804        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
805        let array_ref: ArrayRef = Arc::new(uint32_array);
806        let block = DataBlock::from_array(array_ref);
807
808        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
809        let actual_bit_width = block.expect_stat(Stat::BitWidth);
810        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
811
812        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
813        let array_ref: ArrayRef = Arc::new(uint32_array);
814        let block = DataBlock::from_array(array_ref);
815
816        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
817        let actual_bit_width = block.expect_stat(Stat::BitWidth);
818        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
819
820        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
821        let array_ref: ArrayRef = Arc::new(uint32_array);
822        let block = DataBlock::from_array(array_ref);
823
824        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
825        let actual_bit_width = block.expect_stat(Stat::BitWidth);
826        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
827
828        let uint64_array = UInt64Array::from(vec![1, 2, 3]);
829        let array_ref: ArrayRef = Arc::new(uint64_array);
830        let block = DataBlock::from_array(array_ref);
831
832        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
833        let actual_bit_width = block.expect_stat(Stat::BitWidth);
834        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
835
836        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
837        let array_ref: ArrayRef = Arc::new(uint64_array);
838        let block = DataBlock::from_array(array_ref);
839
840        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
841        let actual_bit_width = block.expect_stat(Stat::BitWidth);
842        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
843
844        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
845        let array_ref: ArrayRef = Arc::new(uint64_array);
846        let block = DataBlock::from_array(array_ref);
847
848        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
849        let actual_bit_width = block.expect_stat(Stat::BitWidth);
850        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
851
852        let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
853        let array_ref: ArrayRef = Arc::new(uint64_array);
854        let block = DataBlock::from_array(array_ref);
855
856        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
857        let actual_bit_width = block.expect_stat(Stat::BitWidth);
858        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
859
860        let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
861        let array_ref: ArrayRef = Arc::new(uint64_array);
862        let block = DataBlock::from_array(array_ref);
863
864        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
865        let actual_bit_width = block.expect_stat(Stat::BitWidth);
866        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
867    }
868
869    #[test]
870    fn test_bit_width_stat_more_than_1024() {
871        for data_type in [
872            DataType::Int8,
873            DataType::Int16,
874            DataType::Int32,
875            DataType::Int64,
876        ] {
877            let array1 = Int64Array::from(vec![3; 1024]);
878            let array2 = Int64Array::from(vec![8; 1024]);
879            let array3 = Int64Array::from(vec![-1; 10]);
880            let array1 = arrow_cast::cast(&array1, &data_type).unwrap();
881            let array2 = arrow_cast::cast(&array2, &data_type).unwrap();
882            let array3 = arrow_cast::cast(&array3, &data_type).unwrap();
883
884            let arrays: Vec<&dyn arrow::array::Array> =
885                vec![array1.as_ref(), array2.as_ref(), array3.as_ref()];
886            let concatenated = concat(&arrays).unwrap();
887            let block = DataBlock::from_array(concatenated.clone());
888
889            let expected_bit_width = Arc::new(UInt64Array::from(vec![
890                2,
891                4,
892                (data_type.byte_width() * 8) as u64,
893            ])) as ArrayRef;
894            let actual_bit_widths = block.expect_stat(Stat::BitWidth);
895            assert_eq!(actual_bit_widths.as_ref(), expected_bit_width.as_ref(),);
896        }
897    }
898
899    #[test]
900    fn test_bit_width_when_none() {
901        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
902        let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
903        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
904        let block = DataBlock::from_array(arr.clone());
905        assert!(block.get_stat(Stat::BitWidth).is_none(),);
906    }
907
908    #[test]
909    fn test_cardinality_variable_width_datablock() {
910        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
911        let block = DataBlock::from_array(string_array);
912        let expected_cardinality = 2;
913        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
914        assert_eq!(actual_cardinality, expected_cardinality,);
915
916        let string_array = StringArray::from(vec![
917            Some("to be named by variables"),
918            Some("to be passed as arguments to procedures"),
919            Some("to be returned as values of procedures"),
920        ]);
921        let block = DataBlock::from_array(string_array);
922        let expected_cardinality = 3;
923        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
924
925        assert_eq!(actual_cardinality, expected_cardinality,);
926
927        let string_array = StringArray::from(vec![
928            Some("Samuel Eilenberg"),
929            Some("Saunders Mac Lane"),
930            Some("Samuel Eilenberg"),
931        ]);
932        let block = DataBlock::from_array(string_array);
933        let expected_cardinality = 2;
934        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
935        assert_eq!(actual_cardinality, expected_cardinality,);
936
937        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
938        let block = DataBlock::from_array(string_array);
939        let expected_cardinality = 2;
940        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
941        assert_eq!(actual_cardinality, expected_cardinality,);
942
943        let string_array = LargeStringArray::from(vec![
944            Some("to be named by variables"),
945            Some("to be passed as arguments to procedures"),
946            Some("to be returned as values of procedures"),
947        ]);
948        let block = DataBlock::from_array(string_array);
949        let expected_cardinality = 3;
950        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
951        assert_eq!(actual_cardinality, expected_cardinality,);
952
953        let string_array = LargeStringArray::from(vec![
954            Some("Samuel Eilenberg"),
955            Some("Saunders Mac Lane"),
956            Some("Samuel Eilenberg"),
957        ]);
958        let block = DataBlock::from_array(string_array);
959        let expected_cardinality = 2;
960        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
961        assert_eq!(actual_cardinality, expected_cardinality,);
962    }
963
964    #[test]
965    fn test_max_length_variable_width_datablock() {
966        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
967        let block = DataBlock::from_array(string_array.clone());
968        let expected_max_length = string_array.value_length(0) as u64;
969        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
970        assert_eq!(actual_max_length, expected_max_length);
971
972        let string_array = StringArray::from(vec![
973            Some("to be named by variables"),
974            Some("to be passed as arguments to procedures"), // string that has max length
975            Some("to be returned as values of procedures"),
976        ]);
977        let block = DataBlock::from_array(string_array.clone());
978        let expected_max_length = string_array.value_length(1) as u64;
979        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
980        assert_eq!(actual_max_length, expected_max_length);
981
982        let string_array = StringArray::from(vec![
983            Some("Samuel Eilenberg"),
984            Some("Saunders Mac Lane"), // string that has max length
985            Some("Samuel Eilenberg"),
986        ]);
987        let block = DataBlock::from_array(string_array.clone());
988        let expected_max_length = string_array.value_length(1) as u64;
989        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
990        assert_eq!(actual_max_length, expected_max_length);
991
992        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
993        let block = DataBlock::from_array(string_array.clone());
994        let expected_max_length = string_array.value_length(1) as u64;
995        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
996        assert_eq!(actual_max_length, expected_max_length);
997
998        let string_array = LargeStringArray::from(vec![
999            Some("to be named by variables"),
1000            Some("to be passed as arguments to procedures"), // string that has max length
1001            Some("to be returned as values of procedures"),
1002        ]);
1003        let block = DataBlock::from_array(string_array.clone());
1004        let expected_max_length = string_array.value(1).len() as u64;
1005        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1006
1007        assert_eq!(actual_max_length, expected_max_length);
1008    }
1009}