Skip to main content

lance_encoding/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{
5    fmt::{self},
6    hash::{Hash, RandomState},
7    sync::Arc,
8};
9
10use arrow_array::{cast::AsArray, types::UInt64Type, Array, ArrowPrimitiveType, UInt64Array};
11use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
12use num_traits::PrimInt;
13
14use crate::data::{
15    AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
16    NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
17};
18
/// Identifies one kind of statistic that may be recorded for a data block.
///
/// Values are stored as Arrow arrays and looked up through [`GetStat::get_stat`]; not every
/// block kind provides every statistic.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values. Fixed-width blocks record one entry
    /// per chunk of 1024 values.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Approximate number of distinct values (a HyperLogLog++ estimate).
    Cardinality,
    /// Not populated by any implementation in this part of the file.
    // NOTE(review): presumably the fixed size of each value -- confirm against callers.
    FixedSize,
    /// Number of null values.
    NullCount,
    /// Length of the longest value (for fixed-width blocks this is `bits_per_value / 8`).
    MaxLength,
    /// Number of runs of consecutive equal values.
    RunCount,
    /// Shannon entropy of each byte position, scaled by 1000 and stored as integers.
    BytePositionEntropy,
}
30
31impl fmt::Debug for Stat {
32    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
33        match self {
34            Self::BitWidth => write!(f, "BitWidth"),
35            Self::DataSize => write!(f, "DataSize"),
36            Self::Cardinality => write!(f, "Cardinality"),
37            Self::FixedSize => write!(f, "FixedSize"),
38            Self::NullCount => write!(f, "NullCount"),
39            Self::MaxLength => write!(f, "MaxLength"),
40            Self::RunCount => write!(f, "RunCount"),
41            Self::BytePositionEntropy => write!(f, "BytePositionEntropy"),
42        }
43    }
44}
45
46impl fmt::Display for Stat {
47    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
48        write!(f, "{:?}", self)
49    }
50}
51
/// Implemented by data blocks that can compute their own statistics.
pub trait ComputeStat {
    /// Computes the statistics for this block.
    ///
    /// The implementations in this file store the results in the block itself (or delegate
    /// to a child block) so they can later be read back through `GetStat`.
    fn compute_stat(&mut self);
}
55
56impl ComputeStat for DataBlock {
57    fn compute_stat(&mut self) {
58        match self {
59            Self::Empty() => {}
60            Self::Constant(_) => {}
61            Self::AllNull(_) => {}
62            Self::Nullable(data_block) => data_block.data.compute_stat(),
63            Self::FixedWidth(data_block) => data_block.compute_stat(),
64            Self::FixedSizeList(data_block) => data_block.compute_stat(),
65            Self::VariableWidth(data_block) => data_block.compute_stat(),
66            Self::Opaque(data_block) => data_block.compute_stat(),
67            Self::Struct(data_block) => data_block.compute_stat(),
68            Self::Dictionary(_) => {}
69        }
70    }
71}
72
73impl ComputeStat for VariableWidthBlock {
74    fn compute_stat(&mut self) {
75        if !self.block_info.0.read().unwrap().is_empty() {
76            panic!("compute_stat should only be called once during DataBlock construction");
77        }
78        let data_size = self.data_size();
79        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
80
81        let cardinality_array = self.cardinality();
82
83        let max_length_array = self.max_length();
84
85        let mut info = self.block_info.0.write().unwrap();
86        info.insert(Stat::DataSize, data_size_array);
87        info.insert(Stat::Cardinality, cardinality_array);
88        info.insert(Stat::MaxLength, max_length_array);
89    }
90}
91
92impl ComputeStat for FixedWidthDataBlock {
93    fn compute_stat(&mut self) {
94        // compute this datablock's data_size
95        let data_size = self.data_size();
96        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
97
98        // compute this datablock's max_bit_width
99        let max_bit_widths = self.max_bit_widths();
100
101        // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8
102        let max_len = self.bits_per_value / 8;
103        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
104
105        let cardidinality_array = match self.bits_per_value {
106            64 | 128 => Some(self.cardinality()),
107            _ => None,
108        };
109
110        // compute run count
111        let run_count_array = self.run_count();
112
113        // compute byte position entropy
114        let byte_position_entropy = self.byte_position_entropy();
115
116        let mut info = self.block_info.0.write().unwrap();
117        info.insert(Stat::DataSize, data_size_array);
118        info.insert(Stat::BitWidth, max_bit_widths);
119        info.insert(Stat::MaxLength, max_len_array);
120        info.insert(Stat::RunCount, run_count_array);
121        info.insert(Stat::BytePositionEntropy, byte_position_entropy);
122        if let Some(cardinality_array) = cardidinality_array {
123            info.insert(Stat::Cardinality, cardinality_array);
124        }
125    }
126}
127
128impl ComputeStat for FixedSizeListBlock {
129    fn compute_stat(&mut self) {
130        // We leave the child stats unchanged.  This may seem odd (e.g. should bit width be the
131        // bit width of the child * dimension?) but it's because we use these stats to determine
132        // compression and we are currently just compressing the child data.
133        //
134        // There is a potential opportunity here to do better.  For example, if we have a FSL of
135        // 4 32-bit integers then we should probably treat them as a single 128-bit integer or maybe
136        // even 4 columns of 32-bit integers.  This might yield better compression.
137        self.child.compute_stat();
138    }
139}
140
141impl ComputeStat for OpaqueBlock {
142    fn compute_stat(&mut self) {
143        // compute this datablock's data_size
144        let data_size = self.data_size();
145        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
146        let mut info = self.block_info.0.write().unwrap();
147        info.insert(Stat::DataSize, data_size_array);
148    }
149}
150
151pub trait GetStat: fmt::Debug {
152    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
153
154    fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
155        self.get_stat(stat)
156            .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
157    }
158
159    fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
160        let stat_value = self.expect_stat(stat);
161        let stat_value = stat_value.as_primitive::<T>();
162        if stat_value.len() != 1 {
163            panic!(
164                "{:?} DataBlock does not have exactly one value for `{} statistics.",
165                self, stat
166            );
167        }
168        stat_value.value(0)
169    }
170}
171
172impl GetStat for DataBlock {
173    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
174        match self {
175            Self::Empty() => None,
176            Self::Constant(_) => None,
177            Self::AllNull(data_block) => data_block.get_stat(stat),
178            Self::Nullable(data_block) => data_block.get_stat(stat),
179            Self::FixedWidth(data_block) => data_block.get_stat(stat),
180            Self::FixedSizeList(data_block) => data_block.get_stat(stat),
181            Self::VariableWidth(data_block) => data_block.get_stat(stat),
182            Self::Opaque(data_block) => data_block.get_stat(stat),
183            Self::Struct(data_block) => data_block.get_stat(stat),
184            Self::Dictionary(data_block) => data_block.get_stat(stat),
185        }
186    }
187}
188
189// NullableDataBlock will be deprecated in Lance 2.1.
190impl GetStat for NullableDataBlock {
191    // This function simply returns the statistics of the inner `DataBlock` of `NullableDataBlock`,
192    // this is not accurate but `NullableDataBlock` is going to be deprecated in Lance 2.1 anyway.
193    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
194        self.data.get_stat(stat)
195    }
196}
197
198impl GetStat for VariableWidthBlock {
199    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
200        let block_info = self.block_info.0.read().unwrap();
201
202        if block_info.is_empty() {
203            panic!("get_stat should be called after statistics are computed.");
204        }
205        block_info.get(&stat).cloned()
206    }
207}
208
209impl GetStat for FixedSizeListBlock {
210    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
211        let child_stat = self.child.get_stat(stat);
212        match stat {
213            Stat::MaxLength => child_stat.map(|max_length| {
214                // this is conservative when working with variable length data as we shouldn't assume
215                // that we have a list of all max-length elements but it's cheap and easy to calculate
216                let max_length = max_length.as_primitive::<UInt64Type>().value(0);
217                Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
218            }),
219            _ => child_stat,
220        }
221    }
222}
223
224impl VariableWidthBlock {
225    // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data
226    // without any adjustment(for example, no null_adjustment for offsets)
227    fn cardinality(&mut self) -> Arc<dyn Array> {
228        const PRECISION: u8 = 4;
229        // The default hasher (currently sip hash 1-3) does not seem to give good results
230        // with HLL.
231        //
232        // In particular, when using randomly generated 12-byte strings, the HLL count was
233        // suggested a cardinality of 500 (out of 1000 unique items and hashes) at least 10%
234        // of the time.
235        //
236        // Using xxhash3 consistently gives better results.
237        let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
238            HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
239
240        match self.bits_per_offset {
241            32 => {
242                let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
243                let offsets: &[u32] = offsets_ref.as_ref();
244
245                offsets
246                    .iter()
247                    .zip(offsets.iter().skip(1))
248                    .for_each(|(&start, &end)| {
249                        hll.insert(&self.data[start as usize..end as usize]);
250                    });
251                let cardinality = hll.count() as u64;
252                Arc::new(UInt64Array::from(vec![cardinality]))
253            }
254            64 => {
255                let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
256                let offsets: &[u64] = offsets_ref.as_ref();
257
258                offsets
259                    .iter()
260                    .zip(offsets.iter().skip(1))
261                    .for_each(|(&start, &end)| {
262                        hll.insert(&self.data[start as usize..end as usize]);
263                    });
264
265                let cardinality = hll.count() as u64;
266                Arc::new(UInt64Array::from(vec![cardinality]))
267            }
268            _ => {
269                unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
270            }
271        }
272    }
273
274    fn max_length(&mut self) -> Arc<dyn Array> {
275        match self.bits_per_offset {
276            32 => {
277                let offsets = self.offsets.borrow_to_typed_slice::<u32>();
278                let offsets = offsets.as_ref();
279                let max_len = offsets
280                    .windows(2)
281                    .map(|pair| pair[1] - pair[0])
282                    .max()
283                    .unwrap_or(0);
284                Arc::new(UInt64Array::from(vec![max_len as u64]))
285            }
286            64 => {
287                let offsets = self.offsets.borrow_to_typed_slice::<u64>();
288                let offsets = offsets.as_ref();
289                let max_len = offsets
290                    .windows(2)
291                    .map(|pair| pair[1] - pair[0])
292                    .max()
293                    .unwrap_or(0);
294                Arc::new(UInt64Array::from(vec![max_len]))
295            }
296            _ => {
297                unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
298            }
299        }
300    }
301}
302
303impl GetStat for AllNullDataBlock {
304    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
305        match stat {
306            Stat::NullCount => {
307                let null_count = self.num_values;
308                Some(Arc::new(UInt64Array::from(vec![null_count])))
309            }
310            Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
311            _ => None,
312        }
313    }
314}
315
316impl GetStat for FixedWidthDataBlock {
317    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
318        let block_info = self.block_info.0.read().unwrap();
319
320        if block_info.is_empty() {
321            panic!("get_stat should be called after statistics are computed.");
322        }
323        block_info.get(&stat).cloned()
324    }
325}
326
327impl FixedWidthDataBlock {
328    fn max_bit_widths(&mut self) -> Arc<dyn Array> {
329        if self.num_values == 0 {
330            return Arc::new(UInt64Array::from(vec![0u64]));
331        }
332
333        const CHUNK_SIZE: usize = 1024;
334
335        fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
336            slice
337                .chunks(CHUNK_SIZE)
338                .map(|chunk| {
339                    let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
340                    bits_per_value - max_value.leading_zeros() as u64
341                })
342                .collect()
343        }
344
345        match self.bits_per_value {
346            8 => {
347                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
348                let u8_slice = u8_slice.as_ref();
349                Arc::new(UInt64Array::from(calculate_max_bit_width(
350                    u8_slice,
351                    self.bits_per_value,
352                )))
353            }
354            16 => {
355                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
356                let u16_slice = u16_slice.as_ref();
357                Arc::new(UInt64Array::from(calculate_max_bit_width(
358                    u16_slice,
359                    self.bits_per_value,
360                )))
361            }
362            32 => {
363                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
364                let u32_slice = u32_slice.as_ref();
365                Arc::new(UInt64Array::from(calculate_max_bit_width(
366                    u32_slice,
367                    self.bits_per_value,
368                )))
369            }
370            64 => {
371                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
372                let u64_slice = u64_slice.as_ref();
373                Arc::new(UInt64Array::from(calculate_max_bit_width(
374                    u64_slice,
375                    self.bits_per_value,
376                )))
377            }
378            _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
379        }
380    }
381
382    fn cardinality(&mut self) -> Arc<dyn Array> {
383        match self.bits_per_value {
384            64 => {
385                let u64_slice_ref = self.data.borrow_to_typed_slice::<u64>();
386                let u64_slice = u64_slice_ref.as_ref();
387
388                const PRECISION: u8 = 4;
389                let mut hll: HyperLogLogPlus<u64, xxhash_rust::xxh3::Xxh3Builder> =
390                    HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default())
391                        .unwrap();
392                for val in u64_slice {
393                    hll.insert(val);
394                }
395                let cardinality = hll.count() as u64;
396                Arc::new(UInt64Array::from(vec![cardinality]))
397            }
398            128 => {
399                let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
400                let u128_slice = u128_slice_ref.as_ref();
401
402                const PRECISION: u8 = 4;
403                let mut hll: HyperLogLogPlus<u128, RandomState> =
404                    HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
405                for val in u128_slice {
406                    hll.insert(val);
407                }
408                let cardinality = hll.count() as u64;
409                Arc::new(UInt64Array::from(vec![cardinality]))
410            }
411            _ => unreachable!(),
412        }
413    }
414
415    /// Counts the number of runs (consecutive sequences of equal values) in the data.
416    ///
417    /// A "run" is defined as a sequence of one or more consecutive equal values.
418    /// For example:
419    /// - `[1, 1, 2, 2, 2, 3]` has 3 runs: [1,1], [2,2,2], and [3]
420    /// - `[1, 2, 3, 4]` has 4 runs (each value is its own run)
421    /// - `[5, 5, 5, 5]` has 1 run
422    ///
423    /// This count is used to determine if RLE compression would be effective.
424    /// Fewer runs relative to the total number of values indicates better RLE compression potential.
425    fn run_count(&mut self) -> Arc<dyn Array> {
426        if self.num_values == 0 {
427            return Arc::new(UInt64Array::from(vec![0u64]));
428        }
429
430        // Inner function to count runs in typed data
431        fn count_runs<T: PartialEq + Copy>(slice: &[T]) -> u64 {
432            if slice.is_empty() {
433                return 0;
434            }
435
436            // Start with 1 run (the first value)
437            let mut runs = 1u64;
438            let mut prev = slice[0];
439
440            // Count value transitions (each transition indicates a new run)
441            for &val in &slice[1..] {
442                if val != prev {
443                    runs += 1;
444                    prev = val;
445                }
446            }
447
448            runs
449        }
450
451        let run_count = match self.bits_per_value {
452            8 => {
453                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
454                count_runs(u8_slice.as_ref())
455            }
456            16 => {
457                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
458                count_runs(u16_slice.as_ref())
459            }
460            32 => {
461                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
462                count_runs(u32_slice.as_ref())
463            }
464            64 => {
465                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
466                count_runs(u64_slice.as_ref())
467            }
468            128 => {
469                let u128_slice = self.data.borrow_to_typed_slice::<u128>();
470                count_runs(u128_slice.as_ref())
471            }
472            _ => self.num_values, // For other bit widths, assume no runs
473        };
474
475        Arc::new(UInt64Array::from(vec![run_count]))
476    }
477
478    /// Calculates entropy for each byte position.
479    /// Returns an array with entropy values for each byte position (scaled by 1000 for integer storage).
480    /// Lower entropy in specific byte positions indicates better suitability for BSS.
481    fn byte_position_entropy(&mut self) -> Arc<dyn Array> {
482        const SAMPLE_SIZE: usize = 64; // Sample more values for better entropy estimation
483
484        // Get sample size (min of data length and SAMPLE_SIZE)
485        let sample_count = (self.num_values as usize).min(SAMPLE_SIZE);
486
487        if sample_count == 0 {
488            // Return empty array for empty data
489            return Arc::new(UInt64Array::from(vec![] as Vec<u64>));
490        }
491
492        let bytes_per_value = (self.bits_per_value / 8) as usize;
493        let mut entropies = Vec::with_capacity(bytes_per_value);
494
495        // Calculate entropy for each byte position
496        for pos in 0..bytes_per_value {
497            let mut byte_counts = [0u32; 256];
498
499            // Count occurrences of each byte value at this position
500            for i in 0..sample_count {
501                let byte_offset = i * bytes_per_value + pos;
502                if byte_offset < self.data.len() {
503                    byte_counts[self.data[byte_offset] as usize] += 1;
504                }
505            }
506
507            // Calculate Shannon entropy for this position
508            let mut entropy = 0.0f64;
509            let total = sample_count as f64;
510
511            for &count in &byte_counts {
512                if count > 0 {
513                    let p = count as f64 / total;
514                    entropy -= p * p.log2();
515                }
516            }
517
518            // Scale by 1000 and store as integer for efficient storage
519            entropies.push((entropy * 1000.0) as u64);
520        }
521
522        Arc::new(UInt64Array::from(entropies))
523    }
524}
525
526impl GetStat for OpaqueBlock {
527    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
528        let block_info = self.block_info.0.read().unwrap();
529
530        if block_info.is_empty() {
531            panic!("get_stat should be called after statistics are computed.");
532        }
533        block_info.get(&stat).cloned()
534    }
535}
536
537impl GetStat for DictionaryDataBlock {
538    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
539        None
540    }
541}
542
543impl GetStat for StructDataBlock {
544    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
545        let block_info = self.block_info.0.read().unwrap();
546        if block_info.is_empty() {
547            panic!("get_stat should be called after statistics are computed.")
548        }
549        block_info.get(&stat).cloned()
550    }
551}
552
553impl ComputeStat for StructDataBlock {
554    fn compute_stat(&mut self) {
555        let data_size = self.data_size();
556        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
557
558        let max_len = self
559            .children
560            .iter()
561            .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
562            .sum::<u64>();
563        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
564
565        let mut info = self.block_info.0.write().unwrap();
566        info.insert(Stat::DataSize, data_size_array);
567        info.insert(Stat::MaxLength, max_len_array);
568    }
569}
570
571#[cfg(test)]
572mod tests {
573    use std::sync::Arc;
574
575    use arrow_array::{
576        ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
577        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
578    };
579    use arrow_schema::{DataType, Field};
580    use lance_arrow::DataTypeExt;
581    use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
582    use rand::SeedableRng;
583
584    use crate::statistics::{GetStat, Stat};
585
586    use super::DataBlock;
587
588    use arrow_array::{
589        cast::AsArray,
590        types::{Int32Type, UInt64Type},
591        Array,
592    };
593    use arrow_select::concat::concat;
594    #[test]
595    fn test_data_size_stat() {
596        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
597        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
598        let arr1 = genn.generate(RowCount::from(3), &mut rng).unwrap();
599        let arr2 = genn.generate(RowCount::from(3), &mut rng).unwrap();
600        let arr3 = genn.generate(RowCount::from(3), &mut rng).unwrap();
601        let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
602
603        let concatenated_array = concat(&[
604            &*Arc::new(arr1.clone()) as &dyn Array,
605            &*Arc::new(arr2.clone()) as &dyn Array,
606            &*Arc::new(arr3.clone()) as &dyn Array,
607        ])
608        .unwrap();
609
610        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
611
612        let total_buffer_size: usize = concatenated_array
613            .to_data()
614            .buffers()
615            .iter()
616            .map(|buffer| buffer.len())
617            .sum();
618        assert!(data_size == total_buffer_size as u64);
619
620        // test DataType::Binary
621        let mut genn = lance_datagen::array::rand_type(&DataType::Binary);
622        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
623        let block = DataBlock::from_array(arr.clone());
624        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
625
626        let total_buffer_size: usize = arr
627            .to_data()
628            .buffers()
629            .iter()
630            .map(|buffer| buffer.len())
631            .sum();
632        assert!(data_size == total_buffer_size as u64);
633
634        // test DataType::Struct
635        let fields = vec![
636            Arc::new(Field::new("int_field", DataType::Int32, false)),
637            Arc::new(Field::new("float_field", DataType::Float32, false)),
638        ]
639        .into();
640
641        let mut genn = lance_datagen::array::rand_type(&DataType::Struct(fields));
642        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
643        let block = DataBlock::from_array(arr.clone());
644        let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
645        let total_buffer_size: usize = arr_parts
646            .iter()
647            .map(|arr| {
648                arr.to_data()
649                    .buffers()
650                    .iter()
651                    .map(|buffer| buffer.len())
652                    .sum::<usize>()
653            })
654            .sum();
655        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
656        assert!(data_size == total_buffer_size as u64);
657
658        // test DataType::Dictionary
659        let mut genn = array::rand_type(&DataType::Dictionary(
660            Box::new(DataType::Int32),
661            Box::new(DataType::Utf8),
662        ));
663        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
664        let block = DataBlock::from_array(arr.clone());
665        assert!(block.get_stat(Stat::DataSize).is_none());
666
667        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
668        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
669        let block = DataBlock::from_array(arr.clone());
670        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
671        let total_buffer_size: usize = arr
672            .to_data()
673            .buffers()
674            .iter()
675            .map(|buffer| buffer.len())
676            .sum();
677
678        assert!(data_size == total_buffer_size as u64);
679    }
680
681    #[test]
682    fn test_bit_width_stat_for_integers() {
683        let int8_array = Int8Array::from(vec![1, 2, 3]);
684        let array_ref: ArrayRef = Arc::new(int8_array);
685        let block = DataBlock::from_array(array_ref);
686
687        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
688        let actual_bit_width = block.expect_stat(Stat::BitWidth);
689
690        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
691
692        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
693        let array_ref: ArrayRef = Arc::new(int8_array);
694        let block = DataBlock::from_array(array_ref);
695
696        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
697        let actual_bit_width = block.expect_stat(Stat::BitWidth);
698        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
699
700        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
701        let array_ref: ArrayRef = Arc::new(int8_array);
702        let block = DataBlock::from_array(array_ref);
703
704        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
705        let actual_bit_width = block.expect_stat(Stat::BitWidth);
706        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
707
708        let int8_array = Int8Array::from(vec![-1, 2, 3]);
709        let array_ref: ArrayRef = Arc::new(int8_array);
710        let block = DataBlock::from_array(array_ref);
711
712        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
713        let actual_bit_width = block.expect_stat(Stat::BitWidth);
714        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
715
716        let int16_array = Int16Array::from(vec![1, 2, 3]);
717        let array_ref: ArrayRef = Arc::new(int16_array);
718        let block = DataBlock::from_array(array_ref);
719
720        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
721        let actual_bit_width = block.expect_stat(Stat::BitWidth);
722        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
723
724        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
725        let array_ref: ArrayRef = Arc::new(int16_array);
726        let block = DataBlock::from_array(array_ref);
727
728        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
729        let actual_bit_width = block.expect_stat(Stat::BitWidth);
730        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
731
732        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
733        let array_ref: ArrayRef = Arc::new(int16_array);
734        let block = DataBlock::from_array(array_ref);
735
736        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
737        let actual_bit_width = block.expect_stat(Stat::BitWidth);
738        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
739
740        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
741        let array_ref: ArrayRef = Arc::new(int16_array);
742        let block = DataBlock::from_array(array_ref);
743
744        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
745        let actual_bit_width = block.expect_stat(Stat::BitWidth);
746        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
747
748        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
749        let array_ref: ArrayRef = Arc::new(int16_array);
750        let block = DataBlock::from_array(array_ref);
751
752        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
753        let actual_bit_width = block.expect_stat(Stat::BitWidth);
754        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
755
756        let int16_array = Int16Array::from(vec![-1, 2, 3]);
757        let array_ref: ArrayRef = Arc::new(int16_array);
758        let block = DataBlock::from_array(array_ref);
759
760        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
761        let actual_bit_width = block.expect_stat(Stat::BitWidth);
762        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
763
764        let int32_array = Int32Array::from(vec![1, 2, 3]);
765        let array_ref: ArrayRef = Arc::new(int32_array);
766        let block = DataBlock::from_array(array_ref);
767
768        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
769        let actual_bit_width = block.expect_stat(Stat::BitWidth);
770        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
771
772        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
773        let array_ref: ArrayRef = Arc::new(int32_array);
774        let block = DataBlock::from_array(array_ref);
775
776        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
777        let actual_bit_width = block.expect_stat(Stat::BitWidth);
778        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
779
780        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
781        let array_ref: ArrayRef = Arc::new(int32_array);
782        let block = DataBlock::from_array(array_ref);
783
784        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
785        let actual_bit_width = block.expect_stat(Stat::BitWidth);
786        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
787
788        let int32_array = Int32Array::from(vec![-1, 2, 3]);
789        let array_ref: ArrayRef = Arc::new(int32_array);
790        let block = DataBlock::from_array(array_ref);
791
792        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
793        let actual_bit_width = block.expect_stat(Stat::BitWidth);
794        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
795
796        let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
797        let array_ref: ArrayRef = Arc::new(int32_array);
798        let block = DataBlock::from_array(array_ref);
799
800        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
801        let actual_bit_width = block.expect_stat(Stat::BitWidth);
802        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
803
804        let int64_array = Int64Array::from(vec![1, 2, 3]);
805        let array_ref: ArrayRef = Arc::new(int64_array);
806        let block = DataBlock::from_array(array_ref);
807
808        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
809        let actual_bit_width = block.expect_stat(Stat::BitWidth);
810        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
811
812        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
813        let array_ref: ArrayRef = Arc::new(int64_array);
814        let block = DataBlock::from_array(array_ref);
815
816        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
817        let actual_bit_width = block.expect_stat(Stat::BitWidth);
818        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
819
820        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
821        let array_ref: ArrayRef = Arc::new(int64_array);
822        let block = DataBlock::from_array(array_ref);
823
824        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
825        let actual_bit_width = block.expect_stat(Stat::BitWidth);
826        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
827
828        let int64_array = Int64Array::from(vec![-1, 2, 3]);
829        let array_ref: ArrayRef = Arc::new(int64_array);
830        let block = DataBlock::from_array(array_ref);
831
832        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
833        let actual_bit_width = block.expect_stat(Stat::BitWidth);
834        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
835
836        let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
837        let array_ref: ArrayRef = Arc::new(int64_array);
838        let block = DataBlock::from_array(array_ref);
839
840        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
841        let actual_bit_width = block.expect_stat(Stat::BitWidth);
842        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
843
844        let uint8_array = UInt8Array::from(vec![1, 2, 3]);
845        let array_ref: ArrayRef = Arc::new(uint8_array);
846        let block = DataBlock::from_array(array_ref);
847
848        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
849        let actual_bit_width = block.expect_stat(Stat::BitWidth);
850        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
851
852        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
853        let array_ref: ArrayRef = Arc::new(uint8_array);
854        let block = DataBlock::from_array(array_ref);
855
856        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
857        let actual_bit_width = block.expect_stat(Stat::BitWidth);
858        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
859
860        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
861        let array_ref: ArrayRef = Arc::new(uint8_array);
862        let block = DataBlock::from_array(array_ref);
863
864        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
865        let actual_bit_width = block.expect_stat(Stat::BitWidth);
866        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
867
868        let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
869        let array_ref: ArrayRef = Arc::new(uint8_array);
870        let block = DataBlock::from_array(array_ref);
871
872        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
873        let actual_bit_width = block.expect_stat(Stat::BitWidth);
874        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
875
876        let uint16_array = UInt16Array::from(vec![1, 2, 3]);
877        let array_ref: ArrayRef = Arc::new(uint16_array);
878        let block = DataBlock::from_array(array_ref);
879
880        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
881        let actual_bit_width = block.expect_stat(Stat::BitWidth);
882        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
883
884        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
885        let array_ref: ArrayRef = Arc::new(uint16_array);
886        let block = DataBlock::from_array(array_ref);
887
888        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
889        let actual_bit_width = block.expect_stat(Stat::BitWidth);
890        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
891
892        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
893        let array_ref: ArrayRef = Arc::new(uint16_array);
894        let block = DataBlock::from_array(array_ref);
895
896        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
897        let actual_bit_width = block.expect_stat(Stat::BitWidth);
898        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
899
900        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
901        let array_ref: ArrayRef = Arc::new(uint16_array);
902        let block = DataBlock::from_array(array_ref);
903
904        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
905        let actual_bit_width = block.expect_stat(Stat::BitWidth);
906        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
907
908        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
909        let array_ref: ArrayRef = Arc::new(uint16_array);
910        let block = DataBlock::from_array(array_ref);
911
912        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
913        let actual_bit_width = block.expect_stat(Stat::BitWidth);
914        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
915
916        let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
917        let array_ref: ArrayRef = Arc::new(uint16_array);
918        let block = DataBlock::from_array(array_ref);
919
920        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
921        let actual_bit_width = block.expect_stat(Stat::BitWidth);
922        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
923
924        let uint32_array = UInt32Array::from(vec![1, 2, 3]);
925        let array_ref: ArrayRef = Arc::new(uint32_array);
926        let block = DataBlock::from_array(array_ref);
927
928        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
929        let actual_bit_width = block.expect_stat(Stat::BitWidth);
930        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
931
932        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
933        let array_ref: ArrayRef = Arc::new(uint32_array);
934        let block = DataBlock::from_array(array_ref);
935
936        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
937        let actual_bit_width = block.expect_stat(Stat::BitWidth);
938        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
939
940        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
941        let array_ref: ArrayRef = Arc::new(uint32_array);
942        let block = DataBlock::from_array(array_ref);
943
944        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
945        let actual_bit_width = block.expect_stat(Stat::BitWidth);
946        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
947
948        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
949        let array_ref: ArrayRef = Arc::new(uint32_array);
950        let block = DataBlock::from_array(array_ref);
951
952        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
953        let actual_bit_width = block.expect_stat(Stat::BitWidth);
954        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
955
956        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
957        let array_ref: ArrayRef = Arc::new(uint32_array);
958        let block = DataBlock::from_array(array_ref);
959
960        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
961        let actual_bit_width = block.expect_stat(Stat::BitWidth);
962        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
963
964        let uint64_array = UInt64Array::from(vec![1, 2, 3]);
965        let array_ref: ArrayRef = Arc::new(uint64_array);
966        let block = DataBlock::from_array(array_ref);
967
968        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
969        let actual_bit_width = block.expect_stat(Stat::BitWidth);
970        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
971
972        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
973        let array_ref: ArrayRef = Arc::new(uint64_array);
974        let block = DataBlock::from_array(array_ref);
975
976        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
977        let actual_bit_width = block.expect_stat(Stat::BitWidth);
978        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
979
980        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
981        let array_ref: ArrayRef = Arc::new(uint64_array);
982        let block = DataBlock::from_array(array_ref);
983
984        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
985        let actual_bit_width = block.expect_stat(Stat::BitWidth);
986        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
987
988        let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
989        let array_ref: ArrayRef = Arc::new(uint64_array);
990        let block = DataBlock::from_array(array_ref);
991
992        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
993        let actual_bit_width = block.expect_stat(Stat::BitWidth);
994        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
995
996        let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
997        let array_ref: ArrayRef = Arc::new(uint64_array);
998        let block = DataBlock::from_array(array_ref);
999
1000        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
1001        let actual_bit_width = block.expect_stat(Stat::BitWidth);
1002        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
1003    }
1004
/// Checks the `BitWidth` stat on inputs longer than 1024 values, for every signed
/// integer type: the test expects one bit width per run (1024, 1024 and 10 values).
#[test]
fn test_bit_width_stat_more_than_1024() {
    // Builds `len` copies of `value` as an Int64 array, then casts it to `target`.
    let repeated = |value: i64, len: usize, target: &DataType| {
        let source = Int64Array::from(vec![value; len]);
        arrow_cast::cast(&source, target).unwrap()
    };

    let signed_types = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ];
    for data_type in signed_types {
        // Two runs of 1024 small positive values followed by ten negative ones.
        let threes = repeated(3, 1024, &data_type);
        let eights = repeated(8, 1024, &data_type);
        let minus_ones = repeated(-1, 10, &data_type);

        let pieces: Vec<&dyn arrow_array::Array> =
            vec![threes.as_ref(), eights.as_ref(), minus_ones.as_ref()];
        let block = DataBlock::from_array(concat(&pieces).unwrap());

        // 3 is expected to need 2 bits, 8 needs 4 bits, and -1 the full width of the type.
        let full_width = (data_type.byte_width() * 8) as u64;
        let expected_bit_width = Arc::new(UInt64Array::from(vec![2, 4, full_width])) as ArrayRef;
        let actual_bit_widths = block.expect_stat(Stat::BitWidth);
        assert_eq!(actual_bit_widths.as_ref(), expected_bit_width.as_ref());
    }
}
1034
/// A block built from randomly generated `Binary` data is expected to carry no
/// `BitWidth` stat at all.
#[test]
fn test_bit_width_when_none() {
    // Fixed seed so the three generated values are reproducible.
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
    let mut binary_gen = lance_datagen::array::rand_type(&DataType::Binary);
    let binary_values = binary_gen.generate(RowCount::from(3), &mut rng).unwrap();

    let block = DataBlock::from_array(binary_values);
    let bit_width = block.get_stat(Stat::BitWidth);
    assert!(bit_width.is_none());
}
1043
/// Checks the `Cardinality` stat of string blocks, using the same inputs for
/// `StringArray` and then for `LargeStringArray`.
#[test]
fn test_cardinality_variable_width_datablock() {
    // Each entry pairs the input strings with the expected number of distinct values.
    let cases: [(Vec<Option<&str>>, u64); 3] = [
        (vec![Some("hello"), Some("world")], 2),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"),
                Some("to be returned as values of procedures"),
            ],
            3,
        ),
        // The first and last values are equal, so only two are distinct.
        (
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"),
                Some("Samuel Eilenberg"),
            ],
            2,
        ),
    ];

    // Arrays with 32-bit offsets.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, *expected_cardinality);
    }

    // Arrays with 64-bit offsets.
    for (values, expected_cardinality) in cases {
        let block = DataBlock::from_array(LargeStringArray::from(values));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, expected_cardinality);
    }
}
1099
/// Checks the `MaxLength` stat of string blocks against the byte length of a
/// value known to be (one of) the longest in each input.
#[test]
fn test_max_length_variable_width_datablock() {
    let greetings = vec![Some("hello"), Some("world")];
    let procedures = vec![
        Some("to be named by variables"),
        Some("to be passed as arguments to procedures"), // string that has max length
        Some("to be returned as values of procedures"),
    ];
    let mathematicians = vec![
        Some("Samuel Eilenberg"),
        Some("Saunders Mac Lane"), // string that has max length
        Some("Samuel Eilenberg"),
    ];

    // Each entry pairs the input with the index of the value whose length is expected.
    for (values, longest) in [(&greetings, 0), (&procedures, 1), (&mathematicians, 1)] {
        let string_array = StringArray::from(values.clone());
        let expected_max_length = string_array.value(longest).len() as u64;
        let block = DataBlock::from_array(string_array);
        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(actual_max_length, expected_max_length);
    }

    // Same check for arrays with 64-bit offsets.
    for (values, longest) in [(&greetings, 1), (&procedures, 1)] {
        let string_array = LargeStringArray::from(values.clone());
        let expected_max_length = string_array.value(longest).len() as u64;
        let block = DataBlock::from_array(string_array);
        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(actual_max_length, expected_max_length);
    }
}
1145
/// Checks the `RunCount` stat (number of runs of consecutive equal values) on
/// several value patterns and primitive types.
#[test]
fn test_run_count_stat() {
    // Each entry pairs the input values with the expected number of runs.
    let int32_cases: [(Vec<i32>, u64); 4] = [
        // Highly repetitive data
        (vec![1, 1, 1, 2, 2, 2, 3, 3, 3], 3),
        // No repetition
        (vec![1, 2, 3, 4, 5], 5),
        // Mixed pattern
        (vec![1, 1, 2, 3, 3, 3, 4, 5, 5], 5),
        // Single value
        (vec![42, 42, 42, 42, 42], 1),
    ];
    for (values, expected_run_count) in int32_cases {
        let block = DataBlock::from_array(Int32Array::from(values));
        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
        assert_eq!(actual_run_count, expected_run_count);
    }

    // Different data types
    let block = DataBlock::from_array(UInt8Array::from(vec![1, 1, 2, 2, 3, 3]));
    let uint8_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(uint8_run_count, 3);

    let block = DataBlock::from_array(Int64Array::from(vec![100, 100, 200, 300, 300]));
    let int64_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(int64_run_count, 3);
}
1189}