lance_encoding/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{
5    fmt::{self},
6    hash::{Hash, RandomState},
7    sync::Arc,
8};
9
10use arrow_array::{cast::AsArray, types::UInt64Type, Array, ArrowPrimitiveType, UInt64Array};
11use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
12use num_traits::PrimInt;
13
14use crate::data::{
15    AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
16    NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
17};
18
/// Identifies one kind of statistic that can be stored for a data block.
///
/// Each statistic is stored as an Arrow array keyed by one of these variants.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values. Fixed-width blocks store
    /// one entry per chunk of values rather than a single number.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Estimated number of distinct values (HyperLogLog++ based).
    Cardinality,
    /// NOTE(review): not produced by any code in this file; meaning should be
    /// confirmed against the consumers of this statistic.
    FixedSize,
    /// Number of null values in the block.
    NullCount,
    /// Length of the longest value: the largest offset delta for variable-width
    /// data, `bits_per_value / 8` for fixed-width data.
    MaxLength,
    /// Number of runs of consecutive equal values.
    RunCount,
    /// Shannon entropy of each byte position, scaled by 1000 and stored as integers.
    BytePositionEntropy,
}
30
31impl fmt::Debug for Stat {
32    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
33        match self {
34            Self::BitWidth => write!(f, "BitWidth"),
35            Self::DataSize => write!(f, "DataSize"),
36            Self::Cardinality => write!(f, "Cardinality"),
37            Self::FixedSize => write!(f, "FixedSize"),
38            Self::NullCount => write!(f, "NullCount"),
39            Self::MaxLength => write!(f, "MaxLength"),
40            Self::RunCount => write!(f, "RunCount"),
41            Self::BytePositionEntropy => write!(f, "BytePositionEntropy"),
42        }
43    }
44}
45
46impl fmt::Display for Stat {
47    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
48        write!(f, "{:?}", self)
49    }
50}
51
/// Implemented by data blocks that can compute their own statistics.
pub trait ComputeStat {
    /// Computes the statistics for this block.
    ///
    /// The implementations in this file write the results into the block's own
    /// statistics storage; nothing is returned to the caller.
    fn compute_stat(&mut self);
}
55
56impl ComputeStat for DataBlock {
57    fn compute_stat(&mut self) {
58        match self {
59            Self::Empty() => {}
60            Self::Constant(_) => {}
61            Self::AllNull(_) => {}
62            Self::Nullable(data_block) => data_block.data.compute_stat(),
63            Self::FixedWidth(data_block) => data_block.compute_stat(),
64            Self::FixedSizeList(data_block) => data_block.compute_stat(),
65            Self::VariableWidth(data_block) => data_block.compute_stat(),
66            Self::Opaque(data_block) => data_block.compute_stat(),
67            Self::Struct(data_block) => data_block.compute_stat(),
68            Self::Dictionary(_) => {}
69        }
70    }
71}
72
73impl ComputeStat for VariableWidthBlock {
74    fn compute_stat(&mut self) {
75        if !self.block_info.0.read().unwrap().is_empty() {
76            panic!("compute_stat should only be called once during DataBlock construction");
77        }
78        let data_size = self.data_size();
79        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
80
81        let cardinality_array = self.cardinality();
82
83        let max_length_array = self.max_length();
84
85        let mut info = self.block_info.0.write().unwrap();
86        info.insert(Stat::DataSize, data_size_array);
87        info.insert(Stat::Cardinality, cardinality_array);
88        info.insert(Stat::MaxLength, max_length_array);
89    }
90}
91
92impl ComputeStat for FixedWidthDataBlock {
93    fn compute_stat(&mut self) {
94        // compute this datablock's data_size
95        let data_size = self.data_size();
96        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
97
98        // compute this datablock's max_bit_width
99        let max_bit_widths = self.max_bit_widths();
100
101        // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8
102        let max_len = self.bits_per_value / 8;
103        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
104
105        let cardidinality_array = if self.bits_per_value == 128 {
106            Some(self.cardinality())
107        } else {
108            None
109        };
110
111        // compute run count
112        let run_count_array = self.run_count();
113
114        // compute byte position entropy
115        let byte_position_entropy = self.byte_position_entropy();
116
117        let mut info = self.block_info.0.write().unwrap();
118        info.insert(Stat::DataSize, data_size_array);
119        info.insert(Stat::BitWidth, max_bit_widths);
120        info.insert(Stat::MaxLength, max_len_array);
121        info.insert(Stat::RunCount, run_count_array);
122        info.insert(Stat::BytePositionEntropy, byte_position_entropy);
123        if let Some(cardinality_array) = cardidinality_array {
124            info.insert(Stat::Cardinality, cardinality_array);
125        }
126    }
127}
128
129impl ComputeStat for FixedSizeListBlock {
130    fn compute_stat(&mut self) {
131        // We leave the child stats unchanged.  This may seem odd (e.g. should bit width be the
132        // bit width of the child * dimension?) but it's because we use these stats to determine
133        // compression and we are currently just compressing the child data.
134        //
135        // There is a potential opportunity here to do better.  For example, if we have a FSL of
136        // 4 32-bit integers then we should probably treat them as a single 128-bit integer or maybe
137        // even 4 columns of 32-bit integers.  This might yield better compression.
138        self.child.compute_stat();
139    }
140}
141
142impl ComputeStat for OpaqueBlock {
143    fn compute_stat(&mut self) {
144        // compute this datablock's data_size
145        let data_size = self.data_size();
146        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
147        let mut info = self.block_info.0.write().unwrap();
148        info.insert(Stat::DataSize, data_size_array);
149    }
150}
151
152pub trait GetStat: fmt::Debug {
153    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
154
155    fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
156        self.get_stat(stat)
157            .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
158    }
159
160    fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
161        let stat_value = self.expect_stat(stat);
162        let stat_value = stat_value.as_primitive::<T>();
163        if stat_value.len() != 1 {
164            panic!(
165                "{:?} DataBlock does not have exactly one value for `{} statistics.",
166                self, stat
167            );
168        }
169        stat_value.value(0)
170    }
171}
172
173impl GetStat for DataBlock {
174    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
175        match self {
176            Self::Empty() => None,
177            Self::Constant(_) => None,
178            Self::AllNull(data_block) => data_block.get_stat(stat),
179            Self::Nullable(data_block) => data_block.get_stat(stat),
180            Self::FixedWidth(data_block) => data_block.get_stat(stat),
181            Self::FixedSizeList(data_block) => data_block.get_stat(stat),
182            Self::VariableWidth(data_block) => data_block.get_stat(stat),
183            Self::Opaque(data_block) => data_block.get_stat(stat),
184            Self::Struct(data_block) => data_block.get_stat(stat),
185            Self::Dictionary(data_block) => data_block.get_stat(stat),
186        }
187    }
188}
189
190// NullableDataBlock will be deprecated in Lance 2.1.
191impl GetStat for NullableDataBlock {
192    // This function simply returns the statistics of the inner `DataBlock` of `NullableDataBlock`,
193    // this is not accurate but `NullableDataBlock` is going to be deprecated in Lance 2.1 anyway.
194    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
195        self.data.get_stat(stat)
196    }
197}
198
199impl GetStat for VariableWidthBlock {
200    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
201        let block_info = self.block_info.0.read().unwrap();
202
203        if block_info.is_empty() {
204            panic!("get_stat should be called after statistics are computed.");
205        }
206        block_info.get(&stat).cloned()
207    }
208}
209
210impl GetStat for FixedSizeListBlock {
211    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
212        let child_stat = self.child.get_stat(stat);
213        match stat {
214            Stat::MaxLength => child_stat.map(|max_length| {
215                // this is conservative when working with variable length data as we shouldn't assume
216                // that we have a list of all max-length elements but it's cheap and easy to calculate
217                let max_length = max_length.as_primitive::<UInt64Type>().value(0);
218                Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
219            }),
220            _ => child_stat,
221        }
222    }
223}
224
225impl VariableWidthBlock {
226    // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data
227    // without any adjustment(for example, no null_adjustment for offsets)
228    fn cardinality(&mut self) -> Arc<dyn Array> {
229        const PRECISION: u8 = 4;
230        // The default hasher (currently sip hash 1-3) does not seem to give good results
231        // with HLL.
232        //
233        // In particular, when using randomly generated 12-byte strings, the HLL count was
234        // suggested a cardinality of 500 (out of 1000 unique items and hashes) at least 10%
235        // of the time.
236        //
237        // Using xxhash3 consistently gives better results.
238        let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
239            HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
240
241        match self.bits_per_offset {
242            32 => {
243                let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
244                let offsets: &[u32] = offsets_ref.as_ref();
245
246                offsets
247                    .iter()
248                    .zip(offsets.iter().skip(1))
249                    .for_each(|(&start, &end)| {
250                        hll.insert(&self.data[start as usize..end as usize]);
251                    });
252                let cardinality = hll.count() as u64;
253                Arc::new(UInt64Array::from(vec![cardinality]))
254            }
255            64 => {
256                let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
257                let offsets: &[u64] = offsets_ref.as_ref();
258
259                offsets
260                    .iter()
261                    .zip(offsets.iter().skip(1))
262                    .for_each(|(&start, &end)| {
263                        hll.insert(&self.data[start as usize..end as usize]);
264                    });
265
266                let cardinality = hll.count() as u64;
267                Arc::new(UInt64Array::from(vec![cardinality]))
268            }
269            _ => {
270                unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
271            }
272        }
273    }
274
275    fn max_length(&mut self) -> Arc<dyn Array> {
276        match self.bits_per_offset {
277            32 => {
278                let offsets = self.offsets.borrow_to_typed_slice::<u32>();
279                let offsets = offsets.as_ref();
280                let max_len = offsets
281                    .windows(2)
282                    .map(|pair| pair[1] - pair[0])
283                    .max()
284                    .unwrap_or(0);
285                Arc::new(UInt64Array::from(vec![max_len as u64]))
286            }
287            64 => {
288                let offsets = self.offsets.borrow_to_typed_slice::<u64>();
289                let offsets = offsets.as_ref();
290                let max_len = offsets
291                    .windows(2)
292                    .map(|pair| pair[1] - pair[0])
293                    .max()
294                    .unwrap_or(0);
295                Arc::new(UInt64Array::from(vec![max_len]))
296            }
297            _ => {
298                unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
299            }
300        }
301    }
302}
303
304impl GetStat for AllNullDataBlock {
305    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
306        match stat {
307            Stat::NullCount => {
308                let null_count = self.num_values;
309                Some(Arc::new(UInt64Array::from(vec![null_count])))
310            }
311            Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
312            _ => None,
313        }
314    }
315}
316
317impl GetStat for FixedWidthDataBlock {
318    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
319        let block_info = self.block_info.0.read().unwrap();
320
321        if block_info.is_empty() {
322            panic!("get_stat should be called after statistics are computed.");
323        }
324        block_info.get(&stat).cloned()
325    }
326}
327
328impl FixedWidthDataBlock {
329    fn max_bit_widths(&mut self) -> Arc<dyn Array> {
330        if self.num_values == 0 {
331            return Arc::new(UInt64Array::from(vec![0u64]));
332        }
333
334        const CHUNK_SIZE: usize = 1024;
335
336        fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
337            slice
338                .chunks(CHUNK_SIZE)
339                .map(|chunk| {
340                    let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
341                    bits_per_value - max_value.leading_zeros() as u64
342                })
343                .collect()
344        }
345
346        match self.bits_per_value {
347            8 => {
348                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
349                let u8_slice = u8_slice.as_ref();
350                Arc::new(UInt64Array::from(calculate_max_bit_width(
351                    u8_slice,
352                    self.bits_per_value,
353                )))
354            }
355            16 => {
356                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
357                let u16_slice = u16_slice.as_ref();
358                Arc::new(UInt64Array::from(calculate_max_bit_width(
359                    u16_slice,
360                    self.bits_per_value,
361                )))
362            }
363            32 => {
364                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
365                let u32_slice = u32_slice.as_ref();
366                Arc::new(UInt64Array::from(calculate_max_bit_width(
367                    u32_slice,
368                    self.bits_per_value,
369                )))
370            }
371            64 => {
372                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
373                let u64_slice = u64_slice.as_ref();
374                Arc::new(UInt64Array::from(calculate_max_bit_width(
375                    u64_slice,
376                    self.bits_per_value,
377                )))
378            }
379            _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
380        }
381    }
382
383    fn cardinality(&mut self) -> Arc<dyn Array> {
384        match self.bits_per_value {
385            128 => {
386                let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
387                let u128_slice = u128_slice_ref.as_ref();
388
389                const PRECISION: u8 = 4;
390                let mut hll: HyperLogLogPlus<u128, RandomState> =
391                    HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
392                for val in u128_slice {
393                    hll.insert(val);
394                }
395                let cardinality = hll.count() as u64;
396                Arc::new(UInt64Array::from(vec![cardinality]))
397            }
398            _ => unreachable!(),
399        }
400    }
401
402    /// Counts the number of runs (consecutive sequences of equal values) in the data.
403    ///
404    /// A "run" is defined as a sequence of one or more consecutive equal values.
405    /// For example:
406    /// - `[1, 1, 2, 2, 2, 3]` has 3 runs: [1,1], [2,2,2], and [3]
407    /// - `[1, 2, 3, 4]` has 4 runs (each value is its own run)
408    /// - `[5, 5, 5, 5]` has 1 run
409    ///
410    /// This count is used to determine if RLE compression would be effective.
411    /// Fewer runs relative to the total number of values indicates better RLE compression potential.
412    fn run_count(&mut self) -> Arc<dyn Array> {
413        if self.num_values == 0 {
414            return Arc::new(UInt64Array::from(vec![0u64]));
415        }
416
417        // Inner function to count runs in typed data
418        fn count_runs<T: PartialEq + Copy>(slice: &[T]) -> u64 {
419            if slice.is_empty() {
420                return 0;
421            }
422
423            // Start with 1 run (the first value)
424            let mut runs = 1u64;
425            let mut prev = slice[0];
426
427            // Count value transitions (each transition indicates a new run)
428            for &val in &slice[1..] {
429                if val != prev {
430                    runs += 1;
431                    prev = val;
432                }
433            }
434
435            runs
436        }
437
438        let run_count = match self.bits_per_value {
439            8 => {
440                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
441                count_runs(u8_slice.as_ref())
442            }
443            16 => {
444                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
445                count_runs(u16_slice.as_ref())
446            }
447            32 => {
448                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
449                count_runs(u32_slice.as_ref())
450            }
451            64 => {
452                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
453                count_runs(u64_slice.as_ref())
454            }
455            128 => {
456                let u128_slice = self.data.borrow_to_typed_slice::<u128>();
457                count_runs(u128_slice.as_ref())
458            }
459            _ => self.num_values, // For other bit widths, assume no runs
460        };
461
462        Arc::new(UInt64Array::from(vec![run_count]))
463    }
464
465    /// Calculates entropy for each byte position.
466    /// Returns an array with entropy values for each byte position (scaled by 1000 for integer storage).
467    /// Lower entropy in specific byte positions indicates better suitability for BSS.
468    fn byte_position_entropy(&mut self) -> Arc<dyn Array> {
469        const SAMPLE_SIZE: usize = 64; // Sample more values for better entropy estimation
470
471        // Get sample size (min of data length and SAMPLE_SIZE)
472        let sample_count = (self.num_values as usize).min(SAMPLE_SIZE);
473
474        if sample_count == 0 {
475            // Return empty array for empty data
476            return Arc::new(UInt64Array::from(vec![] as Vec<u64>));
477        }
478
479        let bytes_per_value = (self.bits_per_value / 8) as usize;
480        let mut entropies = Vec::with_capacity(bytes_per_value);
481
482        // Calculate entropy for each byte position
483        for pos in 0..bytes_per_value {
484            let mut byte_counts = [0u32; 256];
485
486            // Count occurrences of each byte value at this position
487            for i in 0..sample_count {
488                let byte_offset = i * bytes_per_value + pos;
489                if byte_offset < self.data.len() {
490                    byte_counts[self.data[byte_offset] as usize] += 1;
491                }
492            }
493
494            // Calculate Shannon entropy for this position
495            let mut entropy = 0.0f64;
496            let total = sample_count as f64;
497
498            for &count in &byte_counts {
499                if count > 0 {
500                    let p = count as f64 / total;
501                    entropy -= p * p.log2();
502                }
503            }
504
505            // Scale by 1000 and store as integer for efficient storage
506            entropies.push((entropy * 1000.0) as u64);
507        }
508
509        Arc::new(UInt64Array::from(entropies))
510    }
511}
512
513impl GetStat for OpaqueBlock {
514    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
515        let block_info = self.block_info.0.read().unwrap();
516
517        if block_info.is_empty() {
518            panic!("get_stat should be called after statistics are computed.");
519        }
520        block_info.get(&stat).cloned()
521    }
522}
523
524impl GetStat for DictionaryDataBlock {
525    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
526        None
527    }
528}
529
530impl GetStat for StructDataBlock {
531    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
532        let block_info = self.block_info.0.read().unwrap();
533        if block_info.is_empty() {
534            panic!("get_stat should be called after statistics are computed.")
535        }
536        block_info.get(&stat).cloned()
537    }
538}
539
540impl ComputeStat for StructDataBlock {
541    fn compute_stat(&mut self) {
542        let data_size = self.data_size();
543        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
544
545        let max_len = self
546            .children
547            .iter()
548            .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
549            .sum::<u64>();
550        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
551
552        let mut info = self.block_info.0.write().unwrap();
553        info.insert(Stat::DataSize, data_size_array);
554        info.insert(Stat::MaxLength, max_len_array);
555    }
556}
557
558#[cfg(test)]
559mod tests {
560    use std::sync::Arc;
561
562    use arrow_array::{
563        ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
564        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
565    };
566    use arrow_schema::{DataType, Field};
567    use lance_arrow::DataTypeExt;
568    use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
569    use rand::SeedableRng;
570
571    use crate::statistics::{GetStat, Stat};
572
573    use super::DataBlock;
574
575    use arrow_array::{
576        cast::AsArray,
577        types::{Int32Type, UInt64Type},
578        Array,
579    };
580    use arrow_select::concat::concat;
581    #[test]
582    fn test_data_size_stat() {
583        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
584        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
585        let arr1 = genn.generate(RowCount::from(3), &mut rng).unwrap();
586        let arr2 = genn.generate(RowCount::from(3), &mut rng).unwrap();
587        let arr3 = genn.generate(RowCount::from(3), &mut rng).unwrap();
588        let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
589
590        let concatenated_array = concat(&[
591            &*Arc::new(arr1.clone()) as &dyn Array,
592            &*Arc::new(arr2.clone()) as &dyn Array,
593            &*Arc::new(arr3.clone()) as &dyn Array,
594        ])
595        .unwrap();
596
597        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
598
599        let total_buffer_size: usize = concatenated_array
600            .to_data()
601            .buffers()
602            .iter()
603            .map(|buffer| buffer.len())
604            .sum();
605        assert!(data_size == total_buffer_size as u64);
606
607        // test DataType::Binary
608        let mut genn = lance_datagen::array::rand_type(&DataType::Binary);
609        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
610        let block = DataBlock::from_array(arr.clone());
611        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
612
613        let total_buffer_size: usize = arr
614            .to_data()
615            .buffers()
616            .iter()
617            .map(|buffer| buffer.len())
618            .sum();
619        assert!(data_size == total_buffer_size as u64);
620
621        // test DataType::Struct
622        let fields = vec![
623            Arc::new(Field::new("int_field", DataType::Int32, false)),
624            Arc::new(Field::new("float_field", DataType::Float32, false)),
625        ]
626        .into();
627
628        let mut genn = lance_datagen::array::rand_type(&DataType::Struct(fields));
629        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
630        let block = DataBlock::from_array(arr.clone());
631        let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
632        let total_buffer_size: usize = arr_parts
633            .iter()
634            .map(|arr| {
635                arr.to_data()
636                    .buffers()
637                    .iter()
638                    .map(|buffer| buffer.len())
639                    .sum::<usize>()
640            })
641            .sum();
642        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
643        assert!(data_size == total_buffer_size as u64);
644
645        // test DataType::Dictionary
646        let mut genn = array::rand_type(&DataType::Dictionary(
647            Box::new(DataType::Int32),
648            Box::new(DataType::Utf8),
649        ));
650        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
651        let block = DataBlock::from_array(arr.clone());
652        assert!(block.get_stat(Stat::DataSize).is_none());
653
654        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
655        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
656        let block = DataBlock::from_array(arr.clone());
657        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
658        let total_buffer_size: usize = arr
659            .to_data()
660            .buffers()
661            .iter()
662            .map(|buffer| buffer.len())
663            .sum();
664
665        assert!(data_size == total_buffer_size as u64);
666    }
667
668    #[test]
669    fn test_bit_width_stat_for_integers() {
670        let int8_array = Int8Array::from(vec![1, 2, 3]);
671        let array_ref: ArrayRef = Arc::new(int8_array);
672        let block = DataBlock::from_array(array_ref);
673
674        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
675        let actual_bit_width = block.expect_stat(Stat::BitWidth);
676
677        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
678
679        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
680        let array_ref: ArrayRef = Arc::new(int8_array);
681        let block = DataBlock::from_array(array_ref);
682
683        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
684        let actual_bit_width = block.expect_stat(Stat::BitWidth);
685        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
686
687        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
688        let array_ref: ArrayRef = Arc::new(int8_array);
689        let block = DataBlock::from_array(array_ref);
690
691        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
692        let actual_bit_width = block.expect_stat(Stat::BitWidth);
693        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
694
695        let int8_array = Int8Array::from(vec![-1, 2, 3]);
696        let array_ref: ArrayRef = Arc::new(int8_array);
697        let block = DataBlock::from_array(array_ref);
698
699        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
700        let actual_bit_width = block.expect_stat(Stat::BitWidth);
701        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
702
703        let int16_array = Int16Array::from(vec![1, 2, 3]);
704        let array_ref: ArrayRef = Arc::new(int16_array);
705        let block = DataBlock::from_array(array_ref);
706
707        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
708        let actual_bit_width = block.expect_stat(Stat::BitWidth);
709        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
710
711        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
712        let array_ref: ArrayRef = Arc::new(int16_array);
713        let block = DataBlock::from_array(array_ref);
714
715        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
716        let actual_bit_width = block.expect_stat(Stat::BitWidth);
717        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
718
719        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
720        let array_ref: ArrayRef = Arc::new(int16_array);
721        let block = DataBlock::from_array(array_ref);
722
723        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
724        let actual_bit_width = block.expect_stat(Stat::BitWidth);
725        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
726
727        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
728        let array_ref: ArrayRef = Arc::new(int16_array);
729        let block = DataBlock::from_array(array_ref);
730
731        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
732        let actual_bit_width = block.expect_stat(Stat::BitWidth);
733        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
734
735        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
736        let array_ref: ArrayRef = Arc::new(int16_array);
737        let block = DataBlock::from_array(array_ref);
738
739        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
740        let actual_bit_width = block.expect_stat(Stat::BitWidth);
741        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
742
743        let int16_array = Int16Array::from(vec![-1, 2, 3]);
744        let array_ref: ArrayRef = Arc::new(int16_array);
745        let block = DataBlock::from_array(array_ref);
746
747        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
748        let actual_bit_width = block.expect_stat(Stat::BitWidth);
749        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
750
751        let int32_array = Int32Array::from(vec![1, 2, 3]);
752        let array_ref: ArrayRef = Arc::new(int32_array);
753        let block = DataBlock::from_array(array_ref);
754
755        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
756        let actual_bit_width = block.expect_stat(Stat::BitWidth);
757        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
758
759        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
760        let array_ref: ArrayRef = Arc::new(int32_array);
761        let block = DataBlock::from_array(array_ref);
762
763        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
764        let actual_bit_width = block.expect_stat(Stat::BitWidth);
765        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
766
767        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
768        let array_ref: ArrayRef = Arc::new(int32_array);
769        let block = DataBlock::from_array(array_ref);
770
771        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
772        let actual_bit_width = block.expect_stat(Stat::BitWidth);
773        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
774
775        let int32_array = Int32Array::from(vec![-1, 2, 3]);
776        let array_ref: ArrayRef = Arc::new(int32_array);
777        let block = DataBlock::from_array(array_ref);
778
779        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
780        let actual_bit_width = block.expect_stat(Stat::BitWidth);
781        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
782
783        let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
784        let array_ref: ArrayRef = Arc::new(int32_array);
785        let block = DataBlock::from_array(array_ref);
786
787        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
788        let actual_bit_width = block.expect_stat(Stat::BitWidth);
789        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
790
791        let int64_array = Int64Array::from(vec![1, 2, 3]);
792        let array_ref: ArrayRef = Arc::new(int64_array);
793        let block = DataBlock::from_array(array_ref);
794
795        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
796        let actual_bit_width = block.expect_stat(Stat::BitWidth);
797        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
798
799        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
800        let array_ref: ArrayRef = Arc::new(int64_array);
801        let block = DataBlock::from_array(array_ref);
802
803        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
804        let actual_bit_width = block.expect_stat(Stat::BitWidth);
805        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
806
807        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
808        let array_ref: ArrayRef = Arc::new(int64_array);
809        let block = DataBlock::from_array(array_ref);
810
811        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
812        let actual_bit_width = block.expect_stat(Stat::BitWidth);
813        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
814
815        let int64_array = Int64Array::from(vec![-1, 2, 3]);
816        let array_ref: ArrayRef = Arc::new(int64_array);
817        let block = DataBlock::from_array(array_ref);
818
819        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
820        let actual_bit_width = block.expect_stat(Stat::BitWidth);
821        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
822
823        let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
824        let array_ref: ArrayRef = Arc::new(int64_array);
825        let block = DataBlock::from_array(array_ref);
826
827        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
828        let actual_bit_width = block.expect_stat(Stat::BitWidth);
829        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
830
831        let uint8_array = UInt8Array::from(vec![1, 2, 3]);
832        let array_ref: ArrayRef = Arc::new(uint8_array);
833        let block = DataBlock::from_array(array_ref);
834
835        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
836        let actual_bit_width = block.expect_stat(Stat::BitWidth);
837        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
838
839        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
840        let array_ref: ArrayRef = Arc::new(uint8_array);
841        let block = DataBlock::from_array(array_ref);
842
843        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
844        let actual_bit_width = block.expect_stat(Stat::BitWidth);
845        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
846
847        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
848        let array_ref: ArrayRef = Arc::new(uint8_array);
849        let block = DataBlock::from_array(array_ref);
850
851        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
852        let actual_bit_width = block.expect_stat(Stat::BitWidth);
853        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
854
855        let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
856        let array_ref: ArrayRef = Arc::new(uint8_array);
857        let block = DataBlock::from_array(array_ref);
858
859        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
860        let actual_bit_width = block.expect_stat(Stat::BitWidth);
861        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
862
863        let uint16_array = UInt16Array::from(vec![1, 2, 3]);
864        let array_ref: ArrayRef = Arc::new(uint16_array);
865        let block = DataBlock::from_array(array_ref);
866
867        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
868        let actual_bit_width = block.expect_stat(Stat::BitWidth);
869        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
870
871        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
872        let array_ref: ArrayRef = Arc::new(uint16_array);
873        let block = DataBlock::from_array(array_ref);
874
875        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
876        let actual_bit_width = block.expect_stat(Stat::BitWidth);
877        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
878
879        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
880        let array_ref: ArrayRef = Arc::new(uint16_array);
881        let block = DataBlock::from_array(array_ref);
882
883        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
884        let actual_bit_width = block.expect_stat(Stat::BitWidth);
885        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
886
887        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
888        let array_ref: ArrayRef = Arc::new(uint16_array);
889        let block = DataBlock::from_array(array_ref);
890
891        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
892        let actual_bit_width = block.expect_stat(Stat::BitWidth);
893        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
894
895        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
896        let array_ref: ArrayRef = Arc::new(uint16_array);
897        let block = DataBlock::from_array(array_ref);
898
899        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
900        let actual_bit_width = block.expect_stat(Stat::BitWidth);
901        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
902
903        let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
904        let array_ref: ArrayRef = Arc::new(uint16_array);
905        let block = DataBlock::from_array(array_ref);
906
907        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
908        let actual_bit_width = block.expect_stat(Stat::BitWidth);
909        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
910
911        let uint32_array = UInt32Array::from(vec![1, 2, 3]);
912        let array_ref: ArrayRef = Arc::new(uint32_array);
913        let block = DataBlock::from_array(array_ref);
914
915        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
916        let actual_bit_width = block.expect_stat(Stat::BitWidth);
917        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
918
919        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
920        let array_ref: ArrayRef = Arc::new(uint32_array);
921        let block = DataBlock::from_array(array_ref);
922
923        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
924        let actual_bit_width = block.expect_stat(Stat::BitWidth);
925        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
926
927        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
928        let array_ref: ArrayRef = Arc::new(uint32_array);
929        let block = DataBlock::from_array(array_ref);
930
931        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
932        let actual_bit_width = block.expect_stat(Stat::BitWidth);
933        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
934
935        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
936        let array_ref: ArrayRef = Arc::new(uint32_array);
937        let block = DataBlock::from_array(array_ref);
938
939        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
940        let actual_bit_width = block.expect_stat(Stat::BitWidth);
941        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
942
943        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
944        let array_ref: ArrayRef = Arc::new(uint32_array);
945        let block = DataBlock::from_array(array_ref);
946
947        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
948        let actual_bit_width = block.expect_stat(Stat::BitWidth);
949        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
950
951        let uint64_array = UInt64Array::from(vec![1, 2, 3]);
952        let array_ref: ArrayRef = Arc::new(uint64_array);
953        let block = DataBlock::from_array(array_ref);
954
955        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
956        let actual_bit_width = block.expect_stat(Stat::BitWidth);
957        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
958
959        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
960        let array_ref: ArrayRef = Arc::new(uint64_array);
961        let block = DataBlock::from_array(array_ref);
962
963        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
964        let actual_bit_width = block.expect_stat(Stat::BitWidth);
965        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
966
967        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
968        let array_ref: ArrayRef = Arc::new(uint64_array);
969        let block = DataBlock::from_array(array_ref);
970
971        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
972        let actual_bit_width = block.expect_stat(Stat::BitWidth);
973        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
974
975        let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
976        let array_ref: ArrayRef = Arc::new(uint64_array);
977        let block = DataBlock::from_array(array_ref);
978
979        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
980        let actual_bit_width = block.expect_stat(Stat::BitWidth);
981        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
982
983        let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
984        let array_ref: ArrayRef = Arc::new(uint64_array);
985        let block = DataBlock::from_array(array_ref);
986
987        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
988        let actual_bit_width = block.expect_stat(Stat::BitWidth);
989        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
990    }
991
992    #[test]
993    fn test_bit_width_stat_more_than_1024() {
994        for data_type in [
995            DataType::Int8,
996            DataType::Int16,
997            DataType::Int32,
998            DataType::Int64,
999        ] {
1000            let array1 = Int64Array::from(vec![3; 1024]);
1001            let array2 = Int64Array::from(vec![8; 1024]);
1002            let array3 = Int64Array::from(vec![-1; 10]);
1003            let array1 = arrow_cast::cast(&array1, &data_type).unwrap();
1004            let array2 = arrow_cast::cast(&array2, &data_type).unwrap();
1005            let array3 = arrow_cast::cast(&array3, &data_type).unwrap();
1006
1007            let arrays: Vec<&dyn arrow_array::Array> =
1008                vec![array1.as_ref(), array2.as_ref(), array3.as_ref()];
1009            let concatenated = concat(&arrays).unwrap();
1010            let block = DataBlock::from_array(concatenated.clone());
1011
1012            let expected_bit_width = Arc::new(UInt64Array::from(vec![
1013                2,
1014                4,
1015                (data_type.byte_width() * 8) as u64,
1016            ])) as ArrayRef;
1017            let actual_bit_widths = block.expect_stat(Stat::BitWidth);
1018            assert_eq!(actual_bit_widths.as_ref(), expected_bit_width.as_ref(),);
1019        }
1020    }
1021
1022    #[test]
1023    fn test_bit_width_when_none() {
1024        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
1025        let mut genn = lance_datagen::array::rand_type(&DataType::Binary);
1026        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
1027        let block = DataBlock::from_array(arr.clone());
1028        assert!(block.get_stat(Stat::BitWidth).is_none(),);
1029    }
1030
1031    #[test]
1032    fn test_cardinality_variable_width_datablock() {
1033        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
1034        let block = DataBlock::from_array(string_array);
1035        let expected_cardinality = 2;
1036        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1037        assert_eq!(actual_cardinality, expected_cardinality,);
1038
1039        let string_array = StringArray::from(vec![
1040            Some("to be named by variables"),
1041            Some("to be passed as arguments to procedures"),
1042            Some("to be returned as values of procedures"),
1043        ]);
1044        let block = DataBlock::from_array(string_array);
1045        let expected_cardinality = 3;
1046        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1047
1048        assert_eq!(actual_cardinality, expected_cardinality,);
1049
1050        let string_array = StringArray::from(vec![
1051            Some("Samuel Eilenberg"),
1052            Some("Saunders Mac Lane"),
1053            Some("Samuel Eilenberg"),
1054        ]);
1055        let block = DataBlock::from_array(string_array);
1056        let expected_cardinality = 2;
1057        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1058        assert_eq!(actual_cardinality, expected_cardinality,);
1059
1060        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
1061        let block = DataBlock::from_array(string_array);
1062        let expected_cardinality = 2;
1063        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1064        assert_eq!(actual_cardinality, expected_cardinality,);
1065
1066        let string_array = LargeStringArray::from(vec![
1067            Some("to be named by variables"),
1068            Some("to be passed as arguments to procedures"),
1069            Some("to be returned as values of procedures"),
1070        ]);
1071        let block = DataBlock::from_array(string_array);
1072        let expected_cardinality = 3;
1073        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1074        assert_eq!(actual_cardinality, expected_cardinality,);
1075
1076        let string_array = LargeStringArray::from(vec![
1077            Some("Samuel Eilenberg"),
1078            Some("Saunders Mac Lane"),
1079            Some("Samuel Eilenberg"),
1080        ]);
1081        let block = DataBlock::from_array(string_array);
1082        let expected_cardinality = 2;
1083        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
1084        assert_eq!(actual_cardinality, expected_cardinality,);
1085    }
1086
1087    #[test]
1088    fn test_max_length_variable_width_datablock() {
1089        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
1090        let block = DataBlock::from_array(string_array.clone());
1091        let expected_max_length = string_array.value_length(0) as u64;
1092        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1093        assert_eq!(actual_max_length, expected_max_length);
1094
1095        let string_array = StringArray::from(vec![
1096            Some("to be named by variables"),
1097            Some("to be passed as arguments to procedures"), // string that has max length
1098            Some("to be returned as values of procedures"),
1099        ]);
1100        let block = DataBlock::from_array(string_array.clone());
1101        let expected_max_length = string_array.value_length(1) as u64;
1102        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1103        assert_eq!(actual_max_length, expected_max_length);
1104
1105        let string_array = StringArray::from(vec![
1106            Some("Samuel Eilenberg"),
1107            Some("Saunders Mac Lane"), // string that has max length
1108            Some("Samuel Eilenberg"),
1109        ]);
1110        let block = DataBlock::from_array(string_array.clone());
1111        let expected_max_length = string_array.value_length(1) as u64;
1112        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1113        assert_eq!(actual_max_length, expected_max_length);
1114
1115        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
1116        let block = DataBlock::from_array(string_array.clone());
1117        let expected_max_length = string_array.value_length(1) as u64;
1118        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1119        assert_eq!(actual_max_length, expected_max_length);
1120
1121        let string_array = LargeStringArray::from(vec![
1122            Some("to be named by variables"),
1123            Some("to be passed as arguments to procedures"), // string that has max length
1124            Some("to be returned as values of procedures"),
1125        ]);
1126        let block = DataBlock::from_array(string_array.clone());
1127        let expected_max_length = string_array.value(1).len() as u64;
1128        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
1129
1130        assert_eq!(actual_max_length, expected_max_length);
1131    }
1132
1133    #[test]
1134    fn test_run_count_stat() {
1135        // Test with highly repetitive data
1136        let int32_array = Int32Array::from(vec![1, 1, 1, 2, 2, 2, 3, 3, 3]);
1137        let block = DataBlock::from_array(int32_array);
1138        let expected_run_count = 3;
1139        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1140        assert_eq!(actual_run_count, expected_run_count);
1141
1142        // Test with no repetition
1143        let int32_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
1144        let block = DataBlock::from_array(int32_array);
1145        let expected_run_count = 5;
1146        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1147        assert_eq!(actual_run_count, expected_run_count);
1148
1149        // Test with mixed pattern
1150        let int32_array = Int32Array::from(vec![1, 1, 2, 3, 3, 3, 4, 5, 5]);
1151        let block = DataBlock::from_array(int32_array);
1152        let expected_run_count = 5;
1153        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1154        assert_eq!(actual_run_count, expected_run_count);
1155
1156        // Test with single value
1157        let int32_array = Int32Array::from(vec![42, 42, 42, 42, 42]);
1158        let block = DataBlock::from_array(int32_array);
1159        let expected_run_count = 1;
1160        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1161        assert_eq!(actual_run_count, expected_run_count);
1162
1163        // Test with different data types
1164        let uint8_array = UInt8Array::from(vec![1, 1, 2, 2, 3, 3]);
1165        let block = DataBlock::from_array(uint8_array);
1166        let expected_run_count = 3;
1167        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1168        assert_eq!(actual_run_count, expected_run_count);
1169
1170        let int64_array = Int64Array::from(vec![100, 100, 200, 300, 300]);
1171        let block = DataBlock::from_array(int64_array);
1172        let expected_run_count = 3;
1173        let actual_run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
1174        assert_eq!(actual_run_count, expected_run_count);
1175    }
1176}