lance_encoding/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{
5    fmt::{self},
6    hash::{Hash, RandomState},
7    sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16    AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17    NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// The kinds of statistics that can be attached to a data block.
///
/// `Debug` (and therefore `Display`, which forwards to it) prints the bare variant name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values. Fixed-width blocks store one entry per
    /// chunk of values rather than a single number.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Estimated number of distinct values (HyperLogLog++ based).
    Cardinality,
    /// Not populated by any of the `ComputeStat` implementations in this file.
    FixedSize,
    /// Number of null values; reported by all-null blocks.
    NullCount,
    /// Length in bytes of the longest value in the block.
    MaxLength,
    /// Number of runs of consecutive equal values.
    RunCount,
    /// Per-byte-position Shannon entropy, scaled by 1000 and stored as integers.
    BytePositionEntropy,
}
46
47impl fmt::Display for Stat {
48    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
49        write!(f, "{:?}", self)
50    }
51}
52
/// Computes the statistics of a data block and records them on the block itself.
///
/// The leaf implementations in this file write their results into the block's `block_info`
/// map; composite blocks forward the call to the data they wrap.
pub trait ComputeStat {
    /// Computes this block's statistics and stores them for later lookup.
    fn compute_stat(&mut self);
}
56
57impl ComputeStat for DataBlock {
58    fn compute_stat(&mut self) {
59        match self {
60            Self::Empty() => {}
61            Self::Constant(_) => {}
62            Self::AllNull(_) => {}
63            Self::Nullable(data_block) => data_block.data.compute_stat(),
64            Self::FixedWidth(data_block) => data_block.compute_stat(),
65            Self::FixedSizeList(data_block) => data_block.compute_stat(),
66            Self::VariableWidth(data_block) => data_block.compute_stat(),
67            Self::Opaque(data_block) => data_block.compute_stat(),
68            Self::Struct(data_block) => data_block.compute_stat(),
69            Self::Dictionary(_) => {}
70        }
71    }
72}
73
74impl ComputeStat for VariableWidthBlock {
75    fn compute_stat(&mut self) {
76        if !self.block_info.0.read().unwrap().is_empty() {
77            panic!("compute_stat should only be called once during DataBlock construction");
78        }
79        let data_size = self.data_size();
80        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
81
82        let cardinality_array = self.cardinality();
83
84        let max_length_array = self.max_length();
85
86        let mut info = self.block_info.0.write().unwrap();
87        info.insert(Stat::DataSize, data_size_array);
88        info.insert(Stat::Cardinality, cardinality_array);
89        info.insert(Stat::MaxLength, max_length_array);
90    }
91}
92
93impl ComputeStat for FixedWidthDataBlock {
94    fn compute_stat(&mut self) {
95        // compute this datablock's data_size
96        let data_size = self.data_size();
97        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
98
99        // compute this datablock's max_bit_width
100        let max_bit_widths = self.max_bit_widths();
101
102        // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8
103        let max_len = self.bits_per_value / 8;
104        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
105
106        let cardidinality_array = if self.bits_per_value == 128 {
107            Some(self.cardinality())
108        } else {
109            None
110        };
111
112        // compute run count
113        let run_count_array = self.run_count();
114
115        // compute byte position entropy
116        let byte_position_entropy = self.byte_position_entropy();
117
118        let mut info = self.block_info.0.write().unwrap();
119        info.insert(Stat::DataSize, data_size_array);
120        info.insert(Stat::BitWidth, max_bit_widths);
121        info.insert(Stat::MaxLength, max_len_array);
122        info.insert(Stat::RunCount, run_count_array);
123        info.insert(Stat::BytePositionEntropy, byte_position_entropy);
124        if let Some(cardinality_array) = cardidinality_array {
125            info.insert(Stat::Cardinality, cardinality_array);
126        }
127    }
128}
129
impl ComputeStat for FixedSizeListBlock {
    /// Computes the statistics of the child block; the list itself stores none of its own.
    fn compute_stat(&mut self) {
        // We leave the child stats unchanged.  This may seem odd (e.g. should bit width be the
        // bit width of the child * dimension?) but it's because we use these stats to determine
        // compression and we are currently just compressing the child data.
        //
        // There is a potential opportunity here to do better.  For example, if we have a FSL of
        // 4 32-bit integers then we should probably treat them as a single 128-bit integer or maybe
        // even 4 columns of 32-bit integers.  This might yield better compression.
        self.child.compute_stat();
    }
}
142
143impl ComputeStat for OpaqueBlock {
144    fn compute_stat(&mut self) {
145        // compute this datablock's data_size
146        let data_size = self.data_size();
147        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
148        let mut info = self.block_info.0.write().unwrap();
149        info.insert(Stat::DataSize, data_size_array);
150    }
151}
152
153pub trait GetStat: fmt::Debug {
154    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
155
156    fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
157        self.get_stat(stat)
158            .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
159    }
160
161    fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
162        let stat_value = self.expect_stat(stat);
163        let stat_value = stat_value.as_primitive::<T>();
164        if stat_value.len() != 1 {
165            panic!(
166                "{:?} DataBlock does not have exactly one value for `{} statistics.",
167                self, stat
168            );
169        }
170        stat_value.value(0)
171    }
172}
173
174impl GetStat for DataBlock {
175    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
176        match self {
177            Self::Empty() => None,
178            Self::Constant(_) => None,
179            Self::AllNull(data_block) => data_block.get_stat(stat),
180            Self::Nullable(data_block) => data_block.get_stat(stat),
181            Self::FixedWidth(data_block) => data_block.get_stat(stat),
182            Self::FixedSizeList(data_block) => data_block.get_stat(stat),
183            Self::VariableWidth(data_block) => data_block.get_stat(stat),
184            Self::Opaque(data_block) => data_block.get_stat(stat),
185            Self::Struct(data_block) => data_block.get_stat(stat),
186            Self::Dictionary(data_block) => data_block.get_stat(stat),
187        }
188    }
189}
190
// NullableDataBlock will be deprecated in Lance 2.1.
impl GetStat for NullableDataBlock {
    /// Returns the statistics of the inner `DataBlock` wrapped by this `NullableDataBlock`.
    ///
    /// The nulls are not taken into account, so the result is not accurate for the nullable
    /// block as a whole; this is accepted because `NullableDataBlock` is going to be
    /// deprecated in Lance 2.1 anyway.
    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
        self.data.get_stat(stat)
    }
}
199
200impl GetStat for VariableWidthBlock {
201    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
202        let block_info = self.block_info.0.read().unwrap();
203
204        if block_info.is_empty() {
205            panic!("get_stat should be called after statistics are computed.");
206        }
207        block_info.get(&stat).cloned()
208    }
209}
210
211impl GetStat for FixedSizeListBlock {
212    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
213        let child_stat = self.child.get_stat(stat);
214        match stat {
215            Stat::MaxLength => child_stat.map(|max_length| {
216                // this is conservative when working with variable length data as we shouldn't assume
217                // that we have a list of all max-length elements but it's cheap and easy to calculate
218                let max_length = max_length.as_primitive::<UInt64Type>().value(0);
219                Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
220            }),
221            _ => child_stat,
222        }
223    }
224}
225
226impl VariableWidthBlock {
227    // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data
228    // without any adjustment(for example, no null_adjustment for offsets)
229    fn cardinality(&mut self) -> Arc<dyn Array> {
230        const PRECISION: u8 = 4;
231        // The default hasher (currently sip hash 1-3) does not seem to give good results
232        // with HLL.
233        //
234        // In particular, when using randomly generated 12-byte strings, the HLL count was
235        // suggested a cardinality of 500 (out of 1000 unique items and hashes) at least 10%
236        // of the time.
237        //
238        // Using xxhash3 consistently gives better results.
239        let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
240            HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
241
242        match self.bits_per_offset {
243            32 => {
244                let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
245                let offsets: &[u32] = offsets_ref.as_ref();
246
247                offsets
248                    .iter()
249                    .zip(offsets.iter().skip(1))
250                    .for_each(|(&start, &end)| {
251                        hll.insert(&self.data[start as usize..end as usize]);
252                    });
253                let cardinality = hll.count() as u64;
254                Arc::new(UInt64Array::from(vec![cardinality]))
255            }
256            64 => {
257                let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
258                let offsets: &[u64] = offsets_ref.as_ref();
259
260                offsets
261                    .iter()
262                    .zip(offsets.iter().skip(1))
263                    .for_each(|(&start, &end)| {
264                        hll.insert(&self.data[start as usize..end as usize]);
265                    });
266
267                let cardinality = hll.count() as u64;
268                Arc::new(UInt64Array::from(vec![cardinality]))
269            }
270            _ => {
271                unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
272            }
273        }
274    }
275
276    fn max_length(&mut self) -> Arc<dyn Array> {
277        match self.bits_per_offset {
278            32 => {
279                let offsets = self.offsets.borrow_to_typed_slice::<u32>();
280                let offsets = offsets.as_ref();
281                let max_len = offsets
282                    .windows(2)
283                    .map(|pair| pair[1] - pair[0])
284                    .max()
285                    .unwrap_or(0);
286                Arc::new(UInt64Array::from(vec![max_len as u64]))
287            }
288            64 => {
289                let offsets = self.offsets.borrow_to_typed_slice::<u64>();
290                let offsets = offsets.as_ref();
291                let max_len = offsets
292                    .windows(2)
293                    .map(|pair| pair[1] - pair[0])
294                    .max()
295                    .unwrap_or(0);
296                Arc::new(UInt64Array::from(vec![max_len]))
297            }
298            _ => {
299                unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
300            }
301        }
302    }
303}
304
305impl GetStat for AllNullDataBlock {
306    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
307        match stat {
308            Stat::NullCount => {
309                let null_count = self.num_values;
310                Some(Arc::new(UInt64Array::from(vec![null_count])))
311            }
312            Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
313            _ => None,
314        }
315    }
316}
317
318impl GetStat for FixedWidthDataBlock {
319    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
320        let block_info = self.block_info.0.read().unwrap();
321
322        if block_info.is_empty() {
323            panic!("get_stat should be called after statistics are computed.");
324        }
325        block_info.get(&stat).cloned()
326    }
327}
328
329impl FixedWidthDataBlock {
330    fn max_bit_widths(&mut self) -> Arc<dyn Array> {
331        if self.num_values == 0 {
332            return Arc::new(UInt64Array::from(vec![0u64]));
333        }
334
335        const CHUNK_SIZE: usize = 1024;
336
337        fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
338            slice
339                .chunks(CHUNK_SIZE)
340                .map(|chunk| {
341                    let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
342                    bits_per_value - max_value.leading_zeros() as u64
343                })
344                .collect()
345        }
346
347        match self.bits_per_value {
348            8 => {
349                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
350                let u8_slice = u8_slice.as_ref();
351                Arc::new(UInt64Array::from(calculate_max_bit_width(
352                    u8_slice,
353                    self.bits_per_value,
354                )))
355            }
356            16 => {
357                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
358                let u16_slice = u16_slice.as_ref();
359                Arc::new(UInt64Array::from(calculate_max_bit_width(
360                    u16_slice,
361                    self.bits_per_value,
362                )))
363            }
364            32 => {
365                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
366                let u32_slice = u32_slice.as_ref();
367                Arc::new(UInt64Array::from(calculate_max_bit_width(
368                    u32_slice,
369                    self.bits_per_value,
370                )))
371            }
372            64 => {
373                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
374                let u64_slice = u64_slice.as_ref();
375                Arc::new(UInt64Array::from(calculate_max_bit_width(
376                    u64_slice,
377                    self.bits_per_value,
378                )))
379            }
380            _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
381        }
382    }
383
384    fn cardinality(&mut self) -> Arc<dyn Array> {
385        match self.bits_per_value {
386            128 => {
387                let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
388                let u128_slice = u128_slice_ref.as_ref();
389
390                const PRECISION: u8 = 4;
391                let mut hll: HyperLogLogPlus<u128, RandomState> =
392                    HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
393                for val in u128_slice {
394                    hll.insert(val);
395                }
396                let cardinality = hll.count() as u64;
397                Arc::new(UInt64Array::from(vec![cardinality]))
398            }
399            _ => unreachable!(),
400        }
401    }
402
403    /// Counts the number of runs (consecutive sequences of equal values) in the data.
404    ///
405    /// A "run" is defined as a sequence of one or more consecutive equal values.
406    /// For example:
407    /// - `[1, 1, 2, 2, 2, 3]` has 3 runs: [1,1], [2,2,2], and [3]
408    /// - `[1, 2, 3, 4]` has 4 runs (each value is its own run)
409    /// - `[5, 5, 5, 5]` has 1 run
410    ///
411    /// This count is used to determine if RLE compression would be effective.
412    /// Fewer runs relative to the total number of values indicates better RLE compression potential.
413    fn run_count(&mut self) -> Arc<dyn Array> {
414        if self.num_values == 0 {
415            return Arc::new(UInt64Array::from(vec![0u64]));
416        }
417
418        // Inner function to count runs in typed data
419        fn count_runs<T: PartialEq + Copy>(slice: &[T]) -> u64 {
420            if slice.is_empty() {
421                return 0;
422            }
423
424            // Start with 1 run (the first value)
425            let mut runs = 1u64;
426            let mut prev = slice[0];
427
428            // Count value transitions (each transition indicates a new run)
429            for &val in &slice[1..] {
430                if val != prev {
431                    runs += 1;
432                    prev = val;
433                }
434            }
435
436            runs
437        }
438
439        let run_count = match self.bits_per_value {
440            8 => {
441                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
442                count_runs(u8_slice.as_ref())
443            }
444            16 => {
445                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
446                count_runs(u16_slice.as_ref())
447            }
448            32 => {
449                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
450                count_runs(u32_slice.as_ref())
451            }
452            64 => {
453                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
454                count_runs(u64_slice.as_ref())
455            }
456            128 => {
457                let u128_slice = self.data.borrow_to_typed_slice::<u128>();
458                count_runs(u128_slice.as_ref())
459            }
460            _ => self.num_values, // For other bit widths, assume no runs
461        };
462
463        Arc::new(UInt64Array::from(vec![run_count]))
464    }
465
466    /// Calculates entropy for each byte position.
467    /// Returns an array with entropy values for each byte position (scaled by 1000 for integer storage).
468    /// Lower entropy in specific byte positions indicates better suitability for BSS.
469    fn byte_position_entropy(&mut self) -> Arc<dyn Array> {
470        const SAMPLE_SIZE: usize = 64; // Sample more values for better entropy estimation
471
472        // Get sample size (min of data length and SAMPLE_SIZE)
473        let sample_count = (self.num_values as usize).min(SAMPLE_SIZE);
474
475        if sample_count == 0 {
476            // Return empty array for empty data
477            return Arc::new(UInt64Array::from(vec![] as Vec<u64>));
478        }
479
480        let bytes_per_value = (self.bits_per_value / 8) as usize;
481        let mut entropies = Vec::with_capacity(bytes_per_value);
482
483        // Calculate entropy for each byte position
484        for pos in 0..bytes_per_value {
485            let mut byte_counts = [0u32; 256];
486
487            // Count occurrences of each byte value at this position
488            for i in 0..sample_count {
489                let byte_offset = i * bytes_per_value + pos;
490                if byte_offset < self.data.len() {
491                    byte_counts[self.data[byte_offset] as usize] += 1;
492                }
493            }
494
495            // Calculate Shannon entropy for this position
496            let mut entropy = 0.0f64;
497            let total = sample_count as f64;
498
499            for &count in &byte_counts {
500                if count > 0 {
501                    let p = count as f64 / total;
502                    entropy -= p * p.log2();
503                }
504            }
505
506            // Scale by 1000 and store as integer for efficient storage
507            entropies.push((entropy * 1000.0) as u64);
508        }
509
510        Arc::new(UInt64Array::from(entropies))
511    }
512}
513
514impl GetStat for OpaqueBlock {
515    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
516        let block_info = self.block_info.0.read().unwrap();
517
518        if block_info.is_empty() {
519            panic!("get_stat should be called after statistics are computed.");
520        }
521        block_info.get(&stat).cloned()
522    }
523}
524
impl GetStat for DictionaryDataBlock {
    /// Dictionary blocks expose no statistics: every lookup returns `None`.
    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
        None
    }
}
530
531impl GetStat for StructDataBlock {
532    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
533        let block_info = self.block_info.0.read().unwrap();
534        if block_info.is_empty() {
535            panic!("get_stat should be called after statistics are computed.")
536        }
537        block_info.get(&stat).cloned()
538    }
539}
540
541impl ComputeStat for StructDataBlock {
542    fn compute_stat(&mut self) {
543        let data_size = self.data_size();
544        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
545
546        let max_len = self
547            .children
548            .iter()
549            .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
550            .sum::<u64>();
551        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
552
553        let mut info = self.block_info.0.write().unwrap();
554        info.insert(Stat::DataSize, data_size_array);
555        info.insert(Stat::MaxLength, max_len_array);
556    }
557}
558
559#[cfg(test)]
560mod tests {
561    use std::sync::Arc;
562
563    use arrow_array::{
564        ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
565        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
566    };
567    use arrow_schema::{DataType, Field};
568    use lance_arrow::DataTypeExt;
569    use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
570    use rand::SeedableRng;
571
572    use crate::statistics::{GetStat, Stat};
573
574    use super::DataBlock;
575
576    use arrow::{
577        array::AsArray,
578        compute::concat,
579        datatypes::{Int32Type, UInt64Type},
580    };
581    use arrow_array::Array;
582    #[test]
583    fn test_data_size_stat() {
584        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
585        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
586        let arr1 = genn.generate(RowCount::from(3), &mut rng).unwrap();
587        let arr2 = genn.generate(RowCount::from(3), &mut rng).unwrap();
588        let arr3 = genn.generate(RowCount::from(3), &mut rng).unwrap();
589        let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
590
591        let concatenated_array = concat(&[
592            &*Arc::new(arr1.clone()) as &dyn Array,
593            &*Arc::new(arr2.clone()) as &dyn Array,
594            &*Arc::new(arr3.clone()) as &dyn Array,
595        ])
596        .unwrap();
597
598        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
599
600        let total_buffer_size: usize = concatenated_array
601            .to_data()
602            .buffers()
603            .iter()
604            .map(|buffer| buffer.len())
605            .sum();
606        assert!(data_size == total_buffer_size as u64);
607
608        // test DataType::Binary
609        let mut genn = lance_datagen::array::rand_type(&DataType::Binary);
610        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
611        let block = DataBlock::from_array(arr.clone());
612        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
613
614        let total_buffer_size: usize = arr
615            .to_data()
616            .buffers()
617            .iter()
618            .map(|buffer| buffer.len())
619            .sum();
620        assert!(data_size == total_buffer_size as u64);
621
622        // test DataType::Struct
623        let fields = vec![
624            Arc::new(Field::new("int_field", DataType::Int32, false)),
625            Arc::new(Field::new("float_field", DataType::Float32, false)),
626        ]
627        .into();
628
629        let mut genn = lance_datagen::array::rand_type(&DataType::Struct(fields));
630        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
631        let block = DataBlock::from_array(arr.clone());
632        let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
633        let total_buffer_size: usize = arr_parts
634            .iter()
635            .map(|arr| {
636                arr.to_data()
637                    .buffers()
638                    .iter()
639                    .map(|buffer| buffer.len())
640                    .sum::<usize>()
641            })
642            .sum();
643        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
644        assert!(data_size == total_buffer_size as u64);
645
646        // test DataType::Dictionary
647        let mut genn = array::rand_type(&DataType::Dictionary(
648            Box::new(DataType::Int32),
649            Box::new(DataType::Utf8),
650        ));
651        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
652        let block = DataBlock::from_array(arr.clone());
653        assert!(block.get_stat(Stat::DataSize).is_none());
654
655        let mut genn = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
656        let arr = genn.generate(RowCount::from(3), &mut rng).unwrap();
657        let block = DataBlock::from_array(arr.clone());
658        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
659        let total_buffer_size: usize = arr
660            .to_data()
661            .buffers()
662            .iter()
663            .map(|buffer| buffer.len())
664            .sum();
665
666        assert!(data_size == total_buffer_size as u64);
667    }
668
669    #[test]
670    fn test_bit_width_stat_for_integers() {
671        let int8_array = Int8Array::from(vec![1, 2, 3]);
672        let array_ref: ArrayRef = Arc::new(int8_array);
673        let block = DataBlock::from_array(array_ref);
674
675        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
676        let actual_bit_width = block.expect_stat(Stat::BitWidth);
677
678        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
679
680        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
681        let array_ref: ArrayRef = Arc::new(int8_array);
682        let block = DataBlock::from_array(array_ref);
683
684        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
685        let actual_bit_width = block.expect_stat(Stat::BitWidth);
686        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
687
688        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
689        let array_ref: ArrayRef = Arc::new(int8_array);
690        let block = DataBlock::from_array(array_ref);
691
692        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
693        let actual_bit_width = block.expect_stat(Stat::BitWidth);
694        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
695
696        let int8_array = Int8Array::from(vec![-1, 2, 3]);
697        let array_ref: ArrayRef = Arc::new(int8_array);
698        let block = DataBlock::from_array(array_ref);
699
700        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
701        let actual_bit_width = block.expect_stat(Stat::BitWidth);
702        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
703
704        let int16_array = Int16Array::from(vec![1, 2, 3]);
705        let array_ref: ArrayRef = Arc::new(int16_array);
706        let block = DataBlock::from_array(array_ref);
707
708        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
709        let actual_bit_width = block.expect_stat(Stat::BitWidth);
710        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
711
712        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
713        let array_ref: ArrayRef = Arc::new(int16_array);
714        let block = DataBlock::from_array(array_ref);
715
716        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
717        let actual_bit_width = block.expect_stat(Stat::BitWidth);
718        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
719
720        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
721        let array_ref: ArrayRef = Arc::new(int16_array);
722        let block = DataBlock::from_array(array_ref);
723
724        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
725        let actual_bit_width = block.expect_stat(Stat::BitWidth);
726        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
727
728        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
729        let array_ref: ArrayRef = Arc::new(int16_array);
730        let block = DataBlock::from_array(array_ref);
731
732        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
733        let actual_bit_width = block.expect_stat(Stat::BitWidth);
734        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
735
736        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
737        let array_ref: ArrayRef = Arc::new(int16_array);
738        let block = DataBlock::from_array(array_ref);
739
740        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
741        let actual_bit_width = block.expect_stat(Stat::BitWidth);
742        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
743
744        let int16_array = Int16Array::from(vec![-1, 2, 3]);
745        let array_ref: ArrayRef = Arc::new(int16_array);
746        let block = DataBlock::from_array(array_ref);
747
748        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
749        let actual_bit_width = block.expect_stat(Stat::BitWidth);
750        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
751
752        let int32_array = Int32Array::from(vec![1, 2, 3]);
753        let array_ref: ArrayRef = Arc::new(int32_array);
754        let block = DataBlock::from_array(array_ref);
755
756        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
757        let actual_bit_width = block.expect_stat(Stat::BitWidth);
758        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
759
760        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
761        let array_ref: ArrayRef = Arc::new(int32_array);
762        let block = DataBlock::from_array(array_ref);
763
764        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
765        let actual_bit_width = block.expect_stat(Stat::BitWidth);
766        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
767
768        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
769        let array_ref: ArrayRef = Arc::new(int32_array);
770        let block = DataBlock::from_array(array_ref);
771
772        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
773        let actual_bit_width = block.expect_stat(Stat::BitWidth);
774        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
775
776        let int32_array = Int32Array::from(vec![-1, 2, 3]);
777        let array_ref: ArrayRef = Arc::new(int32_array);
778        let block = DataBlock::from_array(array_ref);
779
780        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
781        let actual_bit_width = block.expect_stat(Stat::BitWidth);
782        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
783
784        let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
785        let array_ref: ArrayRef = Arc::new(int32_array);
786        let block = DataBlock::from_array(array_ref);
787
788        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
789        let actual_bit_width = block.expect_stat(Stat::BitWidth);
790        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
791
792        let int64_array = Int64Array::from(vec![1, 2, 3]);
793        let array_ref: ArrayRef = Arc::new(int64_array);
794        let block = DataBlock::from_array(array_ref);
795
796        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
797        let actual_bit_width = block.expect_stat(Stat::BitWidth);
798        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
799
800        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
801        let array_ref: ArrayRef = Arc::new(int64_array);
802        let block = DataBlock::from_array(array_ref);
803
804        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
805        let actual_bit_width = block.expect_stat(Stat::BitWidth);
806        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
807
808        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
809        let array_ref: ArrayRef = Arc::new(int64_array);
810        let block = DataBlock::from_array(array_ref);
811
812        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
813        let actual_bit_width = block.expect_stat(Stat::BitWidth);
814        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
815
816        let int64_array = Int64Array::from(vec![-1, 2, 3]);
817        let array_ref: ArrayRef = Arc::new(int64_array);
818        let block = DataBlock::from_array(array_ref);
819
820        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
821        let actual_bit_width = block.expect_stat(Stat::BitWidth);
822        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
823
824        let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
825        let array_ref: ArrayRef = Arc::new(int64_array);
826        let block = DataBlock::from_array(array_ref);
827
828        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
829        let actual_bit_width = block.expect_stat(Stat::BitWidth);
830        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
831
832        let uint8_array = UInt8Array::from(vec![1, 2, 3]);
833        let array_ref: ArrayRef = Arc::new(uint8_array);
834        let block = DataBlock::from_array(array_ref);
835
836        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
837        let actual_bit_width = block.expect_stat(Stat::BitWidth);
838        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
839
840        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
841        let array_ref: ArrayRef = Arc::new(uint8_array);
842        let block = DataBlock::from_array(array_ref);
843
844        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
845        let actual_bit_width = block.expect_stat(Stat::BitWidth);
846        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
847
848        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
849        let array_ref: ArrayRef = Arc::new(uint8_array);
850        let block = DataBlock::from_array(array_ref);
851
852        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
853        let actual_bit_width = block.expect_stat(Stat::BitWidth);
854        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
855
856        let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
857        let array_ref: ArrayRef = Arc::new(uint8_array);
858        let block = DataBlock::from_array(array_ref);
859
860        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
861        let actual_bit_width = block.expect_stat(Stat::BitWidth);
862        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
863
864        let uint16_array = UInt16Array::from(vec![1, 2, 3]);
865        let array_ref: ArrayRef = Arc::new(uint16_array);
866        let block = DataBlock::from_array(array_ref);
867
868        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
869        let actual_bit_width = block.expect_stat(Stat::BitWidth);
870        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
871
872        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
873        let array_ref: ArrayRef = Arc::new(uint16_array);
874        let block = DataBlock::from_array(array_ref);
875
876        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
877        let actual_bit_width = block.expect_stat(Stat::BitWidth);
878        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
879
880        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
881        let array_ref: ArrayRef = Arc::new(uint16_array);
882        let block = DataBlock::from_array(array_ref);
883
884        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
885        let actual_bit_width = block.expect_stat(Stat::BitWidth);
886        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
887
888        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
889        let array_ref: ArrayRef = Arc::new(uint16_array);
890        let block = DataBlock::from_array(array_ref);
891
892        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
893        let actual_bit_width = block.expect_stat(Stat::BitWidth);
894        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
895
896        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
897        let array_ref: ArrayRef = Arc::new(uint16_array);
898        let block = DataBlock::from_array(array_ref);
899
900        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
901        let actual_bit_width = block.expect_stat(Stat::BitWidth);
902        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
903
904        let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
905        let array_ref: ArrayRef = Arc::new(uint16_array);
906        let block = DataBlock::from_array(array_ref);
907
908        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
909        let actual_bit_width = block.expect_stat(Stat::BitWidth);
910        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
911
912        let uint32_array = UInt32Array::from(vec![1, 2, 3]);
913        let array_ref: ArrayRef = Arc::new(uint32_array);
914        let block = DataBlock::from_array(array_ref);
915
916        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
917        let actual_bit_width = block.expect_stat(Stat::BitWidth);
918        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
919
920        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
921        let array_ref: ArrayRef = Arc::new(uint32_array);
922        let block = DataBlock::from_array(array_ref);
923
924        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
925        let actual_bit_width = block.expect_stat(Stat::BitWidth);
926        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
927
928        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
929        let array_ref: ArrayRef = Arc::new(uint32_array);
930        let block = DataBlock::from_array(array_ref);
931
932        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
933        let actual_bit_width = block.expect_stat(Stat::BitWidth);
934        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
935
936        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
937        let array_ref: ArrayRef = Arc::new(uint32_array);
938        let block = DataBlock::from_array(array_ref);
939
940        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
941        let actual_bit_width = block.expect_stat(Stat::BitWidth);
942        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
943
944        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
945        let array_ref: ArrayRef = Arc::new(uint32_array);
946        let block = DataBlock::from_array(array_ref);
947
948        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
949        let actual_bit_width = block.expect_stat(Stat::BitWidth);
950        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
951
952        let uint64_array = UInt64Array::from(vec![1, 2, 3]);
953        let array_ref: ArrayRef = Arc::new(uint64_array);
954        let block = DataBlock::from_array(array_ref);
955
956        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
957        let actual_bit_width = block.expect_stat(Stat::BitWidth);
958        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
959
960        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
961        let array_ref: ArrayRef = Arc::new(uint64_array);
962        let block = DataBlock::from_array(array_ref);
963
964        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
965        let actual_bit_width = block.expect_stat(Stat::BitWidth);
966        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
967
968        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
969        let array_ref: ArrayRef = Arc::new(uint64_array);
970        let block = DataBlock::from_array(array_ref);
971
972        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
973        let actual_bit_width = block.expect_stat(Stat::BitWidth);
974        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
975
976        let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
977        let array_ref: ArrayRef = Arc::new(uint64_array);
978        let block = DataBlock::from_array(array_ref);
979
980        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
981        let actual_bit_width = block.expect_stat(Stat::BitWidth);
982        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
983
984        let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
985        let array_ref: ArrayRef = Arc::new(uint64_array);
986        let block = DataBlock::from_array(array_ref);
987
988        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
989        let actual_bit_width = block.expect_stat(Stat::BitWidth);
990        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
991    }
992
/// Checks the `BitWidth` stat on input longer than 1024 values.
///
/// For every signed integer type the input is 1024 threes, 1024 eights and
/// ten `-1`s, and the stat is expected to hold three entries: 2 bits, 4 bits
/// and the full width of the type (negative values need every bit).
#[test]
fn test_bit_width_stat_more_than_1024() {
    // (fill value, number of rows) for each consecutive run of the input.
    let runs: [(i64, usize); 3] = [(3, 1024), (8, 1024), (-1, 10)];
    let signed_types = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ];

    for data_type in signed_types {
        // Build each run as Int64 and cast it down to the type under test.
        let pieces: Vec<ArrayRef> = runs
            .iter()
            .map(|&(value, rows)| {
                let wide = Int64Array::from(vec![value; rows]);
                arrow_cast::cast(&wide, &data_type).unwrap()
            })
            .collect();
        let piece_refs: Vec<&dyn arrow::array::Array> =
            pieces.iter().map(|piece| piece.as_ref()).collect();
        let block = DataBlock::from_array(concat(&piece_refs).unwrap());

        let full_width = (data_type.byte_width() * 8) as u64;
        let expected: ArrayRef = Arc::new(UInt64Array::from(vec![2, 4, full_width]));
        let actual = block.expect_stat(Stat::BitWidth);
        assert_eq!(actual.as_ref(), expected.as_ref());
    }
}
1022
/// A block built from a `Binary` array must not expose a `BitWidth` stat.
#[test]
fn test_bit_width_when_none() {
    // Fixed seed keeps the generated binary values reproducible.
    let mut seeded_rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
    let mut binary_gen = lance_datagen::array::rand_type(&DataType::Binary);
    let binary_values = binary_gen
        .generate(RowCount::from(3), &mut seeded_rng)
        .unwrap();

    let block = DataBlock::from_array(binary_values);
    let bit_width = block.get_stat(Stat::BitWidth);
    assert!(bit_width.is_none());
}
1031
/// Checks the `Cardinality` stat of blocks built from string arrays.
///
/// The same inputs are run first through `StringArray` and then through
/// `LargeStringArray`; each case pairs the values with the number of
/// distinct strings they contain.
#[test]
fn test_cardinality_variable_width_datablock() {
    let cases: [(Vec<Option<&str>>, u64); 3] = [
        // Two different values.
        (vec![Some("hello"), Some("world")], 2),
        // Three different values.
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"),
                Some("to be returned as values of procedures"),
            ],
            3,
        ),
        // One value repeated, so only two distinct strings.
        (
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"),
                Some("Samuel Eilenberg"),
            ],
            2,
        ),
    ];

    for (values, distinct) in &cases {
        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(cardinality, *distinct);
    }

    for (values, distinct) in &cases {
        let block = DataBlock::from_array(LargeStringArray::from(values.clone()));
        let cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(cardinality, *distinct);
    }
}
1087
/// Checks the `MaxLength` stat of blocks built from string arrays.
///
/// Each case pairs the input values with the index of a string that has the
/// maximum length; the expected stat is the byte length of that string.
#[test]
fn test_max_length_variable_width_datablock() {
    let utf8_cases: [(Vec<Option<&str>>, usize); 3] = [
        // Both strings have the same length, so either index works.
        (vec![Some("hello"), Some("world")], 0),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"), // string that has max length
                Some("to be returned as values of procedures"),
            ],
            1,
        ),
        (
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"), // string that has max length
                Some("Samuel Eilenberg"),
            ],
            1,
        ),
    ];
    for (values, longest) in utf8_cases {
        let strings = StringArray::from(values);
        // Read the expected length before the array is handed to the block.
        let expected = strings.value(longest).len() as u64;
        let block = DataBlock::from_array(strings);
        let max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, expected);
    }

    let large_utf8_cases: [(Vec<Option<&str>>, usize); 2] = [
        (vec![Some("hello"), Some("world")], 1),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"), // string that has max length
                Some("to be returned as values of procedures"),
            ],
            1,
        ),
    ];
    for (values, longest) in large_utf8_cases {
        let strings = LargeStringArray::from(values);
        let expected = strings.value(longest).len() as u64;
        let block = DataBlock::from_array(strings);
        let max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, expected);
    }
}
1133
/// Checks the `RunCount` stat: the number of runs of consecutive equal
/// values in the input.
#[test]
fn test_run_count_stat() {
    let int32_cases: [(Vec<i32>, u64); 4] = [
        // Highly repetitive data.
        (vec![1, 1, 1, 2, 2, 2, 3, 3, 3], 3),
        // No repetition: every value is its own run.
        (vec![1, 2, 3, 4, 5], 5),
        // Mixed pattern.
        (vec![1, 1, 2, 3, 3, 3, 4, 5, 5], 5),
        // A single repeated value forms one run.
        (vec![42, 42, 42, 42, 42], 1),
    ];
    for (values, runs) in int32_cases {
        let block = DataBlock::from_array(Int32Array::from(values));
        let run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
        assert_eq!(run_count, runs);
    }

    // Other element widths.
    let narrow_block = DataBlock::from_array(UInt8Array::from(vec![1, 1, 2, 2, 3, 3]));
    let narrow_runs = narrow_block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(narrow_runs, 3);

    let wide_block = DataBlock::from_array(Int64Array::from(vec![100, 100, 200, 300, 300]));
    let wide_runs = wide_block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(wide_runs, 3);
}
1177}