1use std::{
5 fmt::{self},
6 hash::{Hash, RandomState},
7 sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16 AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17 NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// Kinds of statistics that can be recorded for a data block.
///
/// Values of this enum are the keys of the per-block statistics map that the
/// `ComputeStat` implementations fill in and the `GetStat` implementations
/// read back.
///
/// `Debug` is derived; for these field-less variants it prints the bare
/// variant name (for example `BitWidth`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values; fixed-width blocks
    /// record one entry per chunk of values.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Estimated number of distinct values (computed with HyperLogLog++).
    Cardinality,
    /// NOTE(review): not written by any compute path visible in this part of
    /// the file; semantics to be confirmed against its users.
    FixedSize,
    /// Number of null values in the block.
    NullCount,
    /// Length of the longest value in the block.
    MaxLength,
    /// Number of runs of consecutive equal values.
    RunCount,
}
44
45impl fmt::Display for Stat {
46 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47 write!(f, "{:?}", self)
48 }
49}
50
/// Implemented by blocks that can compute and record their own statistics.
pub trait ComputeStat {
    /// Computes the statistics for this block.
    ///
    /// The implementations in this file store the results in the block's
    /// `block_info` map, keyed by `Stat`.
    fn compute_stat(&mut self);
}
54
55impl ComputeStat for DataBlock {
56 fn compute_stat(&mut self) {
57 match self {
58 Self::Empty() => {}
59 Self::Constant(_) => {}
60 Self::AllNull(_) => {}
61 Self::Nullable(data_block) => data_block.data.compute_stat(),
62 Self::FixedWidth(data_block) => data_block.compute_stat(),
63 Self::FixedSizeList(data_block) => data_block.compute_stat(),
64 Self::VariableWidth(data_block) => data_block.compute_stat(),
65 Self::Opaque(data_block) => data_block.compute_stat(),
66 Self::Struct(data_block) => data_block.compute_stat(),
67 Self::Dictionary(_) => {}
68 }
69 }
70}
71
72impl ComputeStat for VariableWidthBlock {
73 fn compute_stat(&mut self) {
74 if !self.block_info.0.read().unwrap().is_empty() {
75 panic!("compute_stat should only be called once during DataBlock construction");
76 }
77 let data_size = self.data_size();
78 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
79
80 let cardinality_array = self.cardinality();
81
82 let max_length_array = self.max_length();
83
84 let mut info = self.block_info.0.write().unwrap();
85 info.insert(Stat::DataSize, data_size_array);
86 info.insert(Stat::Cardinality, cardinality_array);
87 info.insert(Stat::MaxLength, max_length_array);
88 }
89}
90
91impl ComputeStat for FixedWidthDataBlock {
92 fn compute_stat(&mut self) {
93 let data_size = self.data_size();
95 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
96
97 let max_bit_widths = self.max_bit_widths();
99
100 let max_len = self.bits_per_value / 8;
102 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
103
104 let cardidinality_array = if self.bits_per_value == 128 {
105 Some(self.cardinality())
106 } else {
107 None
108 };
109
110 let run_count_array = self.run_count();
112
113 let mut info = self.block_info.0.write().unwrap();
114 info.insert(Stat::DataSize, data_size_array);
115 info.insert(Stat::BitWidth, max_bit_widths);
116 info.insert(Stat::MaxLength, max_len_array);
117 info.insert(Stat::RunCount, run_count_array);
118 if let Some(cardinality_array) = cardidinality_array {
119 info.insert(Stat::Cardinality, cardinality_array);
120 }
121 }
122}
123
124impl ComputeStat for FixedSizeListBlock {
125 fn compute_stat(&mut self) {
126 self.child.compute_stat();
134 }
135}
136
137impl ComputeStat for OpaqueBlock {
138 fn compute_stat(&mut self) {
139 let data_size = self.data_size();
141 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
142 let mut info = self.block_info.0.write().unwrap();
143 info.insert(Stat::DataSize, data_size_array);
144 }
145}
146
147pub trait GetStat: fmt::Debug {
148 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
149
150 fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
151 self.get_stat(stat)
152 .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
153 }
154
155 fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
156 let stat_value = self.expect_stat(stat);
157 let stat_value = stat_value.as_primitive::<T>();
158 if stat_value.len() != 1 {
159 panic!(
160 "{:?} DataBlock does not have exactly one value for `{} statistics.",
161 self, stat
162 );
163 }
164 stat_value.value(0)
165 }
166}
167
168impl GetStat for DataBlock {
169 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
170 match self {
171 Self::Empty() => None,
172 Self::Constant(_) => None,
173 Self::AllNull(data_block) => data_block.get_stat(stat),
174 Self::Nullable(data_block) => data_block.get_stat(stat),
175 Self::FixedWidth(data_block) => data_block.get_stat(stat),
176 Self::FixedSizeList(data_block) => data_block.get_stat(stat),
177 Self::VariableWidth(data_block) => data_block.get_stat(stat),
178 Self::Opaque(data_block) => data_block.get_stat(stat),
179 Self::Struct(data_block) => data_block.get_stat(stat),
180 Self::Dictionary(data_block) => data_block.get_stat(stat),
181 }
182 }
183}
184
185impl GetStat for NullableDataBlock {
187 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
190 self.data.get_stat(stat)
191 }
192}
193
194impl GetStat for VariableWidthBlock {
195 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
196 let block_info = self.block_info.0.read().unwrap();
197
198 if block_info.is_empty() {
199 panic!("get_stat should be called after statistics are computed.");
200 }
201 block_info.get(&stat).cloned()
202 }
203}
204
205impl GetStat for FixedSizeListBlock {
206 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
207 let child_stat = self.child.get_stat(stat);
208 match stat {
209 Stat::MaxLength => child_stat.map(|max_length| {
210 let max_length = max_length.as_primitive::<UInt64Type>().value(0);
213 Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
214 }),
215 _ => child_stat,
216 }
217 }
218}
219
220impl VariableWidthBlock {
221 fn cardinality(&mut self) -> Arc<dyn Array> {
224 const PRECISION: u8 = 4;
225 let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
234 HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
235
236 match self.bits_per_offset {
237 32 => {
238 let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
239 let offsets: &[u32] = offsets_ref.as_ref();
240
241 offsets
242 .iter()
243 .zip(offsets.iter().skip(1))
244 .for_each(|(&start, &end)| {
245 hll.insert(&self.data[start as usize..end as usize]);
246 });
247 let cardinality = hll.count() as u64;
248 Arc::new(UInt64Array::from(vec![cardinality]))
249 }
250 64 => {
251 let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
252 let offsets: &[u64] = offsets_ref.as_ref();
253
254 offsets
255 .iter()
256 .zip(offsets.iter().skip(1))
257 .for_each(|(&start, &end)| {
258 hll.insert(&self.data[start as usize..end as usize]);
259 });
260
261 let cardinality = hll.count() as u64;
262 Arc::new(UInt64Array::from(vec![cardinality]))
263 }
264 _ => {
265 unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
266 }
267 }
268 }
269
270 fn max_length(&mut self) -> Arc<dyn Array> {
271 match self.bits_per_offset {
272 32 => {
273 let offsets = self.offsets.borrow_to_typed_slice::<u32>();
274 let offsets = offsets.as_ref();
275 let max_len = offsets
276 .windows(2)
277 .map(|pair| pair[1] - pair[0])
278 .max()
279 .unwrap_or(0);
280 Arc::new(UInt64Array::from(vec![max_len as u64]))
281 }
282 64 => {
283 let offsets = self.offsets.borrow_to_typed_slice::<u64>();
284 let offsets = offsets.as_ref();
285 let max_len = offsets
286 .windows(2)
287 .map(|pair| pair[1] - pair[0])
288 .max()
289 .unwrap_or(0);
290 Arc::new(UInt64Array::from(vec![max_len]))
291 }
292 _ => {
293 unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
294 }
295 }
296 }
297}
298
299impl GetStat for AllNullDataBlock {
300 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
301 match stat {
302 Stat::NullCount => {
303 let null_count = self.num_values;
304 Some(Arc::new(UInt64Array::from(vec![null_count])))
305 }
306 Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
307 _ => None,
308 }
309 }
310}
311
312impl GetStat for FixedWidthDataBlock {
313 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
314 let block_info = self.block_info.0.read().unwrap();
315
316 if block_info.is_empty() {
317 panic!("get_stat should be called after statistics are computed.");
318 }
319 block_info.get(&stat).cloned()
320 }
321}
322
323impl FixedWidthDataBlock {
324 fn max_bit_widths(&mut self) -> Arc<dyn Array> {
325 assert!(self.num_values > 0);
326
327 const CHUNK_SIZE: usize = 1024;
328
329 fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
330 slice
331 .chunks(CHUNK_SIZE)
332 .map(|chunk| {
333 let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
334 bits_per_value - max_value.leading_zeros() as u64
335 })
336 .collect()
337 }
338
339 match self.bits_per_value {
340 8 => {
341 let u8_slice = self.data.borrow_to_typed_slice::<u8>();
342 let u8_slice = u8_slice.as_ref();
343 Arc::new(UInt64Array::from(calculate_max_bit_width(
344 u8_slice,
345 self.bits_per_value,
346 )))
347 }
348 16 => {
349 let u16_slice = self.data.borrow_to_typed_slice::<u16>();
350 let u16_slice = u16_slice.as_ref();
351 Arc::new(UInt64Array::from(calculate_max_bit_width(
352 u16_slice,
353 self.bits_per_value,
354 )))
355 }
356 32 => {
357 let u32_slice = self.data.borrow_to_typed_slice::<u32>();
358 let u32_slice = u32_slice.as_ref();
359 Arc::new(UInt64Array::from(calculate_max_bit_width(
360 u32_slice,
361 self.bits_per_value,
362 )))
363 }
364 64 => {
365 let u64_slice = self.data.borrow_to_typed_slice::<u64>();
366 let u64_slice = u64_slice.as_ref();
367 Arc::new(UInt64Array::from(calculate_max_bit_width(
368 u64_slice,
369 self.bits_per_value,
370 )))
371 }
372 _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
373 }
374 }
375
376 fn cardinality(&mut self) -> Arc<dyn Array> {
377 match self.bits_per_value {
378 128 => {
379 let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
380 let u128_slice = u128_slice_ref.as_ref();
381
382 const PRECISION: u8 = 4;
383 let mut hll: HyperLogLogPlus<u128, RandomState> =
384 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
385 for val in u128_slice {
386 hll.insert(val);
387 }
388 let cardinality = hll.count() as u64;
389 Arc::new(UInt64Array::from(vec![cardinality]))
390 }
391 _ => unreachable!(),
392 }
393 }
394
395 fn run_count(&mut self) -> Arc<dyn Array> {
406 assert!(self.num_values > 0);
407
408 fn count_runs<T: PartialEq + Copy>(slice: &[T]) -> u64 {
410 if slice.is_empty() {
411 return 0;
412 }
413
414 let mut runs = 1u64;
416 let mut prev = slice[0];
417
418 for &val in &slice[1..] {
420 if val != prev {
421 runs += 1;
422 prev = val;
423 }
424 }
425
426 runs
427 }
428
429 let run_count = match self.bits_per_value {
430 8 => {
431 let u8_slice = self.data.borrow_to_typed_slice::<u8>();
432 count_runs(u8_slice.as_ref())
433 }
434 16 => {
435 let u16_slice = self.data.borrow_to_typed_slice::<u16>();
436 count_runs(u16_slice.as_ref())
437 }
438 32 => {
439 let u32_slice = self.data.borrow_to_typed_slice::<u32>();
440 count_runs(u32_slice.as_ref())
441 }
442 64 => {
443 let u64_slice = self.data.borrow_to_typed_slice::<u64>();
444 count_runs(u64_slice.as_ref())
445 }
446 128 => {
447 let u128_slice = self.data.borrow_to_typed_slice::<u128>();
448 count_runs(u128_slice.as_ref())
449 }
450 _ => self.num_values, };
452
453 Arc::new(UInt64Array::from(vec![run_count]))
454 }
455}
456
457impl GetStat for OpaqueBlock {
458 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
459 let block_info = self.block_info.0.read().unwrap();
460
461 if block_info.is_empty() {
462 panic!("get_stat should be called after statistics are computed.");
463 }
464 block_info.get(&stat).cloned()
465 }
466}
467
468impl GetStat for DictionaryDataBlock {
469 fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
470 None
471 }
472}
473
474impl GetStat for StructDataBlock {
475 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
476 let block_info = self.block_info.0.read().unwrap();
477 if block_info.is_empty() {
478 panic!("get_stat should be called after statistics are computed.")
479 }
480 block_info.get(&stat).cloned()
481 }
482}
483
484impl ComputeStat for StructDataBlock {
485 fn compute_stat(&mut self) {
486 let data_size = self.data_size();
487 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
488
489 let max_len = self
490 .children
491 .iter()
492 .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
493 .sum::<u64>();
494 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
495
496 let mut info = self.block_info.0.write().unwrap();
497 info.insert(Stat::DataSize, data_size_array);
498 info.insert(Stat::MaxLength, max_len_array);
499 }
500}
501
502#[cfg(test)]
503mod tests {
504 use std::sync::Arc;
505
506 use arrow_array::{
507 ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
508 UInt16Array, UInt32Array, UInt64Array, UInt8Array,
509 };
510 use arrow_schema::{DataType, Field};
511 use lance_arrow::DataTypeExt;
512 use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
513 use rand::SeedableRng;
514
515 use crate::statistics::{GetStat, Stat};
516
517 use super::DataBlock;
518
519 use arrow::{
520 array::AsArray,
521 compute::concat,
522 datatypes::{Int32Type, UInt64Type},
523 };
524 use arrow_array::Array;
525 #[test]
526 fn test_data_size_stat() {
527 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
528 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
529 let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap();
530 let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap();
531 let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap();
532 let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
533
534 let concatenated_array = concat(&[
535 &*Arc::new(arr1.clone()) as &dyn Array,
536 &*Arc::new(arr2.clone()) as &dyn Array,
537 &*Arc::new(arr3.clone()) as &dyn Array,
538 ])
539 .unwrap();
540
541 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
542
543 let total_buffer_size: usize = concatenated_array
544 .to_data()
545 .buffers()
546 .iter()
547 .map(|buffer| buffer.len())
548 .sum();
549 assert!(data_size == total_buffer_size as u64);
550
551 let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
553 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
554 let block = DataBlock::from_array(arr.clone());
555 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
556
557 let total_buffer_size: usize = arr
558 .to_data()
559 .buffers()
560 .iter()
561 .map(|buffer| buffer.len())
562 .sum();
563 assert!(data_size == total_buffer_size as u64);
564
565 let fields = vec![
567 Arc::new(Field::new("int_field", DataType::Int32, false)),
568 Arc::new(Field::new("float_field", DataType::Float32, false)),
569 ]
570 .into();
571
572 let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields));
573 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
574 let block = DataBlock::from_array(arr.clone());
575 let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
576 let total_buffer_size: usize = arr_parts
577 .iter()
578 .map(|arr| {
579 arr.to_data()
580 .buffers()
581 .iter()
582 .map(|buffer| buffer.len())
583 .sum::<usize>()
584 })
585 .sum();
586 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
587 assert!(data_size == total_buffer_size as u64);
588
589 let mut gen = array::rand_type(&DataType::Dictionary(
591 Box::new(DataType::Int32),
592 Box::new(DataType::Utf8),
593 ));
594 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
595 let block = DataBlock::from_array(arr.clone());
596 assert!(block.get_stat(Stat::DataSize).is_none());
597
598 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
599 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
600 let block = DataBlock::from_array(arr.clone());
601 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
602 let total_buffer_size: usize = arr
603 .to_data()
604 .buffers()
605 .iter()
606 .map(|buffer| buffer.len())
607 .sum();
608
609 assert!(data_size == total_buffer_size as u64);
610 }
611
612 #[test]
613 fn test_bit_width_stat_for_integers() {
614 let int8_array = Int8Array::from(vec![1, 2, 3]);
615 let array_ref: ArrayRef = Arc::new(int8_array);
616 let block = DataBlock::from_array(array_ref);
617
618 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
619 let actual_bit_width = block.expect_stat(Stat::BitWidth);
620
621 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
622
623 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
624 let array_ref: ArrayRef = Arc::new(int8_array);
625 let block = DataBlock::from_array(array_ref);
626
627 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
628 let actual_bit_width = block.expect_stat(Stat::BitWidth);
629 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
630
631 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
632 let array_ref: ArrayRef = Arc::new(int8_array);
633 let block = DataBlock::from_array(array_ref);
634
635 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
636 let actual_bit_width = block.expect_stat(Stat::BitWidth);
637 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
638
639 let int8_array = Int8Array::from(vec![-1, 2, 3]);
640 let array_ref: ArrayRef = Arc::new(int8_array);
641 let block = DataBlock::from_array(array_ref);
642
643 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
644 let actual_bit_width = block.expect_stat(Stat::BitWidth);
645 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
646
647 let int16_array = Int16Array::from(vec![1, 2, 3]);
648 let array_ref: ArrayRef = Arc::new(int16_array);
649 let block = DataBlock::from_array(array_ref);
650
651 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
652 let actual_bit_width = block.expect_stat(Stat::BitWidth);
653 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
654
655 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
656 let array_ref: ArrayRef = Arc::new(int16_array);
657 let block = DataBlock::from_array(array_ref);
658
659 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
660 let actual_bit_width = block.expect_stat(Stat::BitWidth);
661 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
662
663 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
664 let array_ref: ArrayRef = Arc::new(int16_array);
665 let block = DataBlock::from_array(array_ref);
666
667 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
668 let actual_bit_width = block.expect_stat(Stat::BitWidth);
669 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
670
671 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
672 let array_ref: ArrayRef = Arc::new(int16_array);
673 let block = DataBlock::from_array(array_ref);
674
675 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
676 let actual_bit_width = block.expect_stat(Stat::BitWidth);
677 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
678
679 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
680 let array_ref: ArrayRef = Arc::new(int16_array);
681 let block = DataBlock::from_array(array_ref);
682
683 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
684 let actual_bit_width = block.expect_stat(Stat::BitWidth);
685 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
686
687 let int16_array = Int16Array::from(vec![-1, 2, 3]);
688 let array_ref: ArrayRef = Arc::new(int16_array);
689 let block = DataBlock::from_array(array_ref);
690
691 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
692 let actual_bit_width = block.expect_stat(Stat::BitWidth);
693 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
694
695 let int32_array = Int32Array::from(vec![1, 2, 3]);
696 let array_ref: ArrayRef = Arc::new(int32_array);
697 let block = DataBlock::from_array(array_ref);
698
699 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
700 let actual_bit_width = block.expect_stat(Stat::BitWidth);
701 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
702
703 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
704 let array_ref: ArrayRef = Arc::new(int32_array);
705 let block = DataBlock::from_array(array_ref);
706
707 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
708 let actual_bit_width = block.expect_stat(Stat::BitWidth);
709 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
710
711 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
712 let array_ref: ArrayRef = Arc::new(int32_array);
713 let block = DataBlock::from_array(array_ref);
714
715 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
716 let actual_bit_width = block.expect_stat(Stat::BitWidth);
717 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
718
719 let int32_array = Int32Array::from(vec![-1, 2, 3]);
720 let array_ref: ArrayRef = Arc::new(int32_array);
721 let block = DataBlock::from_array(array_ref);
722
723 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
724 let actual_bit_width = block.expect_stat(Stat::BitWidth);
725 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
726
727 let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
728 let array_ref: ArrayRef = Arc::new(int32_array);
729 let block = DataBlock::from_array(array_ref);
730
731 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
732 let actual_bit_width = block.expect_stat(Stat::BitWidth);
733 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
734
735 let int64_array = Int64Array::from(vec![1, 2, 3]);
736 let array_ref: ArrayRef = Arc::new(int64_array);
737 let block = DataBlock::from_array(array_ref);
738
739 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
740 let actual_bit_width = block.expect_stat(Stat::BitWidth);
741 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
742
743 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
744 let array_ref: ArrayRef = Arc::new(int64_array);
745 let block = DataBlock::from_array(array_ref);
746
747 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
748 let actual_bit_width = block.expect_stat(Stat::BitWidth);
749 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
750
751 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
752 let array_ref: ArrayRef = Arc::new(int64_array);
753 let block = DataBlock::from_array(array_ref);
754
755 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
756 let actual_bit_width = block.expect_stat(Stat::BitWidth);
757 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
758
759 let int64_array = Int64Array::from(vec![-1, 2, 3]);
760 let array_ref: ArrayRef = Arc::new(int64_array);
761 let block = DataBlock::from_array(array_ref);
762
763 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
764 let actual_bit_width = block.expect_stat(Stat::BitWidth);
765 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
766
767 let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
768 let array_ref: ArrayRef = Arc::new(int64_array);
769 let block = DataBlock::from_array(array_ref);
770
771 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
772 let actual_bit_width = block.expect_stat(Stat::BitWidth);
773 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
774
775 let uint8_array = UInt8Array::from(vec![1, 2, 3]);
776 let array_ref: ArrayRef = Arc::new(uint8_array);
777 let block = DataBlock::from_array(array_ref);
778
779 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
780 let actual_bit_width = block.expect_stat(Stat::BitWidth);
781 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
782
783 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
784 let array_ref: ArrayRef = Arc::new(uint8_array);
785 let block = DataBlock::from_array(array_ref);
786
787 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
788 let actual_bit_width = block.expect_stat(Stat::BitWidth);
789 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
790
791 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
792 let array_ref: ArrayRef = Arc::new(uint8_array);
793 let block = DataBlock::from_array(array_ref);
794
795 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
796 let actual_bit_width = block.expect_stat(Stat::BitWidth);
797 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
798
799 let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
800 let array_ref: ArrayRef = Arc::new(uint8_array);
801 let block = DataBlock::from_array(array_ref);
802
803 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
804 let actual_bit_width = block.expect_stat(Stat::BitWidth);
805 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
806
807 let uint16_array = UInt16Array::from(vec![1, 2, 3]);
808 let array_ref: ArrayRef = Arc::new(uint16_array);
809 let block = DataBlock::from_array(array_ref);
810
811 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
812 let actual_bit_width = block.expect_stat(Stat::BitWidth);
813 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
814
815 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
816 let array_ref: ArrayRef = Arc::new(uint16_array);
817 let block = DataBlock::from_array(array_ref);
818
819 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
820 let actual_bit_width = block.expect_stat(Stat::BitWidth);
821 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
822
823 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
824 let array_ref: ArrayRef = Arc::new(uint16_array);
825 let block = DataBlock::from_array(array_ref);
826
827 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
828 let actual_bit_width = block.expect_stat(Stat::BitWidth);
829 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
830
831 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
832 let array_ref: ArrayRef = Arc::new(uint16_array);
833 let block = DataBlock::from_array(array_ref);
834
835 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
836 let actual_bit_width = block.expect_stat(Stat::BitWidth);
837 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
838
839 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
840 let array_ref: ArrayRef = Arc::new(uint16_array);
841 let block = DataBlock::from_array(array_ref);
842
843 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
844 let actual_bit_width = block.expect_stat(Stat::BitWidth);
845 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
846
847 let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
848 let array_ref: ArrayRef = Arc::new(uint16_array);
849 let block = DataBlock::from_array(array_ref);
850
851 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
852 let actual_bit_width = block.expect_stat(Stat::BitWidth);
853 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
854
855 let uint32_array = UInt32Array::from(vec![1, 2, 3]);
856 let array_ref: ArrayRef = Arc::new(uint32_array);
857 let block = DataBlock::from_array(array_ref);
858
859 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
860 let actual_bit_width = block.expect_stat(Stat::BitWidth);
861 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
862
863 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
864 let array_ref: ArrayRef = Arc::new(uint32_array);
865 let block = DataBlock::from_array(array_ref);
866
867 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
868 let actual_bit_width = block.expect_stat(Stat::BitWidth);
869 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
870
871 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
872 let array_ref: ArrayRef = Arc::new(uint32_array);
873 let block = DataBlock::from_array(array_ref);
874
875 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
876 let actual_bit_width = block.expect_stat(Stat::BitWidth);
877 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
878
879 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
880 let array_ref: ArrayRef = Arc::new(uint32_array);
881 let block = DataBlock::from_array(array_ref);
882
883 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
884 let actual_bit_width = block.expect_stat(Stat::BitWidth);
885 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
886
887 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
888 let array_ref: ArrayRef = Arc::new(uint32_array);
889 let block = DataBlock::from_array(array_ref);
890
891 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
892 let actual_bit_width = block.expect_stat(Stat::BitWidth);
893 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
894
895 let uint64_array = UInt64Array::from(vec![1, 2, 3]);
896 let array_ref: ArrayRef = Arc::new(uint64_array);
897 let block = DataBlock::from_array(array_ref);
898
899 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
900 let actual_bit_width = block.expect_stat(Stat::BitWidth);
901 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
902
903 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
904 let array_ref: ArrayRef = Arc::new(uint64_array);
905 let block = DataBlock::from_array(array_ref);
906
907 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
908 let actual_bit_width = block.expect_stat(Stat::BitWidth);
909 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
910
911 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
912 let array_ref: ArrayRef = Arc::new(uint64_array);
913 let block = DataBlock::from_array(array_ref);
914
915 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
916 let actual_bit_width = block.expect_stat(Stat::BitWidth);
917 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
918
919 let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
920 let array_ref: ArrayRef = Arc::new(uint64_array);
921 let block = DataBlock::from_array(array_ref);
922
923 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
924 let actual_bit_width = block.expect_stat(Stat::BitWidth);
925 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
926
927 let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
928 let array_ref: ArrayRef = Arc::new(uint64_array);
929 let block = DataBlock::from_array(array_ref);
930
931 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
932 let actual_bit_width = block.expect_stat(Stat::BitWidth);
933 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
934 }
935
/// Checks `Stat::BitWidth` for signed integer blocks holding more than 1024 values.
///
/// For each signed integer type the input is 1024 copies of `3`, followed by 1024 copies
/// of `8`, followed by 10 copies of `-1`. The expected stat has three entries: 2 bits,
/// 4 bits, and the full width of the type (for the negative values).
// NOTE(review): the three entries presumably correspond to 1024-value chunks — confirm
// against the fixed-width `compute_stat` implementation.
#[test]
fn test_bit_width_stat_more_than_1024() {
    for data_type in [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ] {
        let array1 = arrow_cast::cast(&Int64Array::from(vec![3; 1024]), &data_type).unwrap();
        let array2 = arrow_cast::cast(&Int64Array::from(vec![8; 1024]), &data_type).unwrap();
        let array3 = arrow_cast::cast(&Int64Array::from(vec![-1; 10]), &data_type).unwrap();

        let arrays: Vec<&dyn arrow::array::Array> =
            vec![array1.as_ref(), array2.as_ref(), array3.as_ref()];
        // The concatenated array is not needed afterwards, so it is moved into the block
        // instead of being cloned first.
        let block = DataBlock::from_array(concat(&arrays).unwrap());

        let expected_bit_width = Arc::new(UInt64Array::from(vec![
            2,
            4,
            (data_type.byte_width() * 8) as u64,
        ])) as ArrayRef;
        let actual_bit_widths = block.expect_stat(Stat::BitWidth);
        assert_eq!(
            actual_bit_widths.as_ref(),
            expected_bit_width.as_ref(),
            "unexpected bit widths for data type {:?}",
            data_type
        );
    }
}
965
/// A block built from a randomly generated `Binary` array must not expose `Stat::BitWidth`.
#[test]
fn test_bit_width_when_none() {
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
    // `gen` is a reserved keyword starting with the 2024 edition, so the generator gets a
    // name that stays valid across editions.
    let mut generator = lance_datagen::array::rand_type(&DataType::Binary);
    let arr = generator.generate(RowCount::from(3), &mut rng).unwrap();
    // The array is not used again, so it is moved rather than cloned.
    let block = DataBlock::from_array(arr);
    assert!(block.get_stat(Stat::BitWidth).is_none());
}
974
/// Verifies `Stat::Cardinality` for blocks built from `StringArray` and `LargeStringArray`.
///
/// The same three value sets are checked against both array types: two distinct values,
/// three distinct values, and three values containing one duplicate.
#[test]
fn test_cardinality_variable_width_datablock() {
    let cases = [
        (vec![Some("hello"), Some("world")], 2),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"),
                Some("to be returned as values of procedures"),
            ],
            3,
        ),
        (
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"),
                Some("Samuel Eilenberg"),
            ],
            2,
        ),
    ];

    // First pass: `StringArray` inputs.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(cardinality, *expected_cardinality);
    }

    // Second pass: the same values as `LargeStringArray` inputs.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(LargeStringArray::from(values.clone()));
        let cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(cardinality, *expected_cardinality);
    }
}
1030
/// Verifies `Stat::MaxLength` for blocks built from `StringArray` and `LargeStringArray`.
///
/// In every case the expected value is read back from the source array: the length of the
/// element at a fixed index that holds (one of) the longest strings.
#[test]
fn test_max_length_variable_width_datablock() {
    // StringArray, both values have the same length.
    {
        let words = StringArray::from(vec![Some("hello"), Some("world")]);
        let max_length = DataBlock::from_array(words.clone())
            .expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, words.value_length(0) as u64);
    }

    // StringArray, the longest value sits at index 1.
    {
        let phrases = StringArray::from(vec![
            Some("to be named by variables"),
            Some("to be passed as arguments to procedures"),
            Some("to be returned as values of procedures"),
        ]);
        let max_length = DataBlock::from_array(phrases.clone())
            .expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, phrases.value_length(1) as u64);
    }

    // StringArray with a repeated value; the longest value sits at index 1.
    {
        let names = StringArray::from(vec![
            Some("Samuel Eilenberg"),
            Some("Saunders Mac Lane"),
            Some("Samuel Eilenberg"),
        ]);
        let max_length = DataBlock::from_array(names.clone())
            .expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, names.value_length(1) as u64);
    }

    // LargeStringArray, both values have the same length.
    {
        let words = LargeStringArray::from(vec![Some("hello"), Some("world")]);
        let max_length = DataBlock::from_array(words.clone())
            .expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, words.value_length(1) as u64);
    }

    // LargeStringArray, the longest value sits at index 1.
    {
        let phrases = LargeStringArray::from(vec![
            Some("to be named by variables"),
            Some("to be passed as arguments to procedures"),
            Some("to be returned as values of procedures"),
        ]);
        let max_length = DataBlock::from_array(phrases.clone())
            .expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(max_length, phrases.value(1).len() as u64);
    }
}
1076
/// Verifies `Stat::RunCount` for blocks built from primitive integer arrays.
///
/// The expected value in each case is the number of maximal runs of equal adjacent values
/// in the input.
#[test]
fn test_run_count_stat() {
    // Int32 inputs: three equal-length runs, no repeats at all, mixed run lengths,
    // and a single run covering the whole array.
    let int32_cases: [(Vec<i32>, _); 4] = [
        (vec![1, 1, 1, 2, 2, 2, 3, 3, 3], 3),
        (vec![1, 2, 3, 4, 5], 5),
        (vec![1, 1, 2, 3, 3, 3, 4, 5, 5], 5),
        (vec![42, 42, 42, 42, 42], 1),
    ];
    for (values, expected_run_count) in int32_cases {
        let block = DataBlock::from_array(Int32Array::from(values));
        let run_count = block.expect_single_stat::<UInt64Type>(Stat::RunCount);
        assert_eq!(run_count, expected_run_count);
    }

    // The same check on a narrower unsigned type.
    let narrow_block = DataBlock::from_array(UInt8Array::from(vec![1, 1, 2, 2, 3, 3]));
    let narrow_run_count = narrow_block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(narrow_run_count, 3);

    // And on a wider signed type.
    let wide_block = DataBlock::from_array(Int64Array::from(vec![100, 100, 200, 300, 300]));
    let wide_run_count = wide_block.expect_single_stat::<UInt64Type>(Stat::RunCount);
    assert_eq!(wide_run_count, 3);
}
1120}