1use std::{
5 fmt::{self},
6 hash::{Hash, RandomState},
7 sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16 AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17 NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// Identifies one kind of statistic that can be attached to a data block.
///
/// Values are used as keys of the per-block statistics map and as the
/// argument of [`GetStat::get_stat`].
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the values of a fixed-width block,
    /// reported once per chunk of values.
    BitWidth,
    /// Size of the block's data as reported by the block's `data_size()`.
    DataSize,
    /// Approximate number of distinct values (HyperLogLog++ estimate).
    Cardinality,
    /// NOTE(review): not computed by any block in this file; meaning should be
    /// confirmed against its consumers.
    FixedSize,
    /// Number of null values; reported by all-null blocks.
    NullCount,
    /// Largest length of a single value in the block.
    MaxLength,
}
29
30impl fmt::Debug for Stat {
31 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32 match self {
33 Self::BitWidth => write!(f, "BitWidth"),
34 Self::DataSize => write!(f, "DataSize"),
35 Self::Cardinality => write!(f, "Cardinality"),
36 Self::FixedSize => write!(f, "FixedSize"),
37 Self::NullCount => write!(f, "NullCount"),
38 Self::MaxLength => write!(f, "MaxLength"),
39 }
40 }
41}
42
43impl fmt::Display for Stat {
44 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45 write!(f, "{:?}", self)
46 }
47}
48
/// Implemented by data blocks that can compute their own statistics.
pub trait ComputeStat {
    /// Computes the statistics of this block and records them on the block.
    ///
    /// The `VariableWidthBlock` implementation panics when statistics have
    /// already been recorded, so call this only once per block.
    fn compute_stat(&mut self);
}
52
53impl ComputeStat for DataBlock {
54 fn compute_stat(&mut self) {
55 match self {
56 Self::Empty() => {}
57 Self::Constant(_) => {}
58 Self::AllNull(_) => {}
59 Self::Nullable(data_block) => data_block.data.compute_stat(),
60 Self::FixedWidth(data_block) => data_block.compute_stat(),
61 Self::FixedSizeList(data_block) => data_block.compute_stat(),
62 Self::VariableWidth(data_block) => data_block.compute_stat(),
63 Self::Opaque(data_block) => data_block.compute_stat(),
64 Self::Struct(data_block) => data_block.compute_stat(),
65 Self::Dictionary(_) => {}
66 }
67 }
68}
69
70impl ComputeStat for VariableWidthBlock {
71 fn compute_stat(&mut self) {
72 if !self.block_info.0.read().unwrap().is_empty() {
73 panic!("compute_stat should only be called once during DataBlock construction");
74 }
75 let data_size = self.data_size();
76 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
77
78 let cardinality_array = self.cardinality();
79
80 let max_length_array = self.max_length();
81
82 let mut info = self.block_info.0.write().unwrap();
83 info.insert(Stat::DataSize, data_size_array);
84 info.insert(Stat::Cardinality, cardinality_array);
85 info.insert(Stat::MaxLength, max_length_array);
86 }
87}
88
89impl ComputeStat for FixedWidthDataBlock {
90 fn compute_stat(&mut self) {
91 let data_size = self.data_size();
93 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
94
95 let max_bit_widths = self.max_bit_widths();
97
98 let max_len = self.bits_per_value / 8;
100 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
101
102 let cardidinality_array = if self.bits_per_value == 128 {
103 Some(self.cardinality())
104 } else {
105 None
106 };
107
108 let mut info = self.block_info.0.write().unwrap();
109 info.insert(Stat::DataSize, data_size_array);
110 info.insert(Stat::BitWidth, max_bit_widths);
111 info.insert(Stat::MaxLength, max_len_array);
112 if let Some(cardinality_array) = cardidinality_array {
113 info.insert(Stat::Cardinality, cardinality_array);
114 }
115 }
116}
117
118impl ComputeStat for FixedSizeListBlock {
119 fn compute_stat(&mut self) {
120 self.child.compute_stat();
128 }
129}
130
131impl ComputeStat for OpaqueBlock {
132 fn compute_stat(&mut self) {
133 let data_size = self.data_size();
135 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
136 let mut info = self.block_info.0.write().unwrap();
137 info.insert(Stat::DataSize, data_size_array);
138 }
139}
140
141pub trait GetStat: fmt::Debug {
142 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
143
144 fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
145 self.get_stat(stat)
146 .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
147 }
148
149 fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
150 let stat_value = self.expect_stat(stat);
151 let stat_value = stat_value.as_primitive::<T>();
152 if stat_value.len() != 1 {
153 panic!(
154 "{:?} DataBlock does not have exactly one value for `{} statistics.",
155 self, stat
156 );
157 }
158 stat_value.value(0)
159 }
160}
161
162impl GetStat for DataBlock {
163 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
164 match self {
165 Self::Empty() => None,
166 Self::Constant(_) => None,
167 Self::AllNull(data_block) => data_block.get_stat(stat),
168 Self::Nullable(data_block) => data_block.get_stat(stat),
169 Self::FixedWidth(data_block) => data_block.get_stat(stat),
170 Self::FixedSizeList(data_block) => data_block.get_stat(stat),
171 Self::VariableWidth(data_block) => data_block.get_stat(stat),
172 Self::Opaque(data_block) => data_block.get_stat(stat),
173 Self::Struct(data_block) => data_block.get_stat(stat),
174 Self::Dictionary(data_block) => data_block.get_stat(stat),
175 }
176 }
177}
178
179impl GetStat for NullableDataBlock {
181 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
184 self.data.get_stat(stat)
185 }
186}
187
188impl GetStat for VariableWidthBlock {
189 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
190 let block_info = self.block_info.0.read().unwrap();
191
192 if block_info.is_empty() {
193 panic!("get_stat should be called after statistics are computed.");
194 }
195 block_info.get(&stat).cloned()
196 }
197}
198
199impl GetStat for FixedSizeListBlock {
200 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
201 let child_stat = self.child.get_stat(stat);
202 match stat {
203 Stat::MaxLength => child_stat.map(|max_length| {
204 let max_length = max_length.as_primitive::<UInt64Type>().value(0);
207 Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
208 }),
209 _ => child_stat,
210 }
211 }
212}
213
214impl VariableWidthBlock {
215 fn cardinality(&mut self) -> Arc<dyn Array> {
218 const PRECISION: u8 = 4;
219 let mut hll: HyperLogLogPlus<&[u8], xxhash_rust::xxh3::Xxh3Builder> =
228 HyperLogLogPlus::new(PRECISION, xxhash_rust::xxh3::Xxh3Builder::default()).unwrap();
229
230 match self.bits_per_offset {
231 32 => {
232 let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
233 let offsets: &[u32] = offsets_ref.as_ref();
234
235 offsets
236 .iter()
237 .zip(offsets.iter().skip(1))
238 .for_each(|(&start, &end)| {
239 hll.insert(&self.data[start as usize..end as usize]);
240 });
241 let cardinality = hll.count() as u64;
242 Arc::new(UInt64Array::from(vec![cardinality]))
243 }
244 64 => {
245 let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
246 let offsets: &[u64] = offsets_ref.as_ref();
247
248 offsets
249 .iter()
250 .zip(offsets.iter().skip(1))
251 .for_each(|(&start, &end)| {
252 hll.insert(&self.data[start as usize..end as usize]);
253 });
254
255 let cardinality = hll.count() as u64;
256 Arc::new(UInt64Array::from(vec![cardinality]))
257 }
258 _ => {
259 unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
260 }
261 }
262 }
263
264 fn max_length(&mut self) -> Arc<dyn Array> {
265 match self.bits_per_offset {
266 32 => {
267 let offsets = self.offsets.borrow_to_typed_slice::<u32>();
268 let offsets = offsets.as_ref();
269 let max_len = offsets
270 .windows(2)
271 .map(|pair| pair[1] - pair[0])
272 .max()
273 .unwrap_or(0);
274 Arc::new(UInt64Array::from(vec![max_len as u64]))
275 }
276 64 => {
277 let offsets = self.offsets.borrow_to_typed_slice::<u64>();
278 let offsets = offsets.as_ref();
279 let max_len = offsets
280 .windows(2)
281 .map(|pair| pair[1] - pair[0])
282 .max()
283 .unwrap_or(0);
284 Arc::new(UInt64Array::from(vec![max_len]))
285 }
286 _ => {
287 unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
288 }
289 }
290 }
291}
292
293impl GetStat for AllNullDataBlock {
294 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
295 match stat {
296 Stat::NullCount => {
297 let null_count = self.num_values;
298 Some(Arc::new(UInt64Array::from(vec![null_count])))
299 }
300 Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
301 _ => None,
302 }
303 }
304}
305
306impl GetStat for FixedWidthDataBlock {
307 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
308 let block_info = self.block_info.0.read().unwrap();
309
310 if block_info.is_empty() {
311 panic!("get_stat should be called after statistics are computed.");
312 }
313 block_info.get(&stat).cloned()
314 }
315}
316
317impl FixedWidthDataBlock {
318 fn max_bit_widths(&mut self) -> Arc<dyn Array> {
319 assert!(self.num_values > 0);
320
321 const CHUNK_SIZE: usize = 1024;
322
323 fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
324 slice
325 .chunks(CHUNK_SIZE)
326 .map(|chunk| {
327 let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
328 bits_per_value - max_value.leading_zeros() as u64
329 })
330 .collect()
331 }
332
333 match self.bits_per_value {
334 8 => {
335 let u8_slice = self.data.borrow_to_typed_slice::<u8>();
336 let u8_slice = u8_slice.as_ref();
337 Arc::new(UInt64Array::from(calculate_max_bit_width(
338 u8_slice,
339 self.bits_per_value,
340 )))
341 }
342 16 => {
343 let u16_slice = self.data.borrow_to_typed_slice::<u16>();
344 let u16_slice = u16_slice.as_ref();
345 Arc::new(UInt64Array::from(calculate_max_bit_width(
346 u16_slice,
347 self.bits_per_value,
348 )))
349 }
350 32 => {
351 let u32_slice = self.data.borrow_to_typed_slice::<u32>();
352 let u32_slice = u32_slice.as_ref();
353 Arc::new(UInt64Array::from(calculate_max_bit_width(
354 u32_slice,
355 self.bits_per_value,
356 )))
357 }
358 64 => {
359 let u64_slice = self.data.borrow_to_typed_slice::<u64>();
360 let u64_slice = u64_slice.as_ref();
361 Arc::new(UInt64Array::from(calculate_max_bit_width(
362 u64_slice,
363 self.bits_per_value,
364 )))
365 }
366 _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
367 }
368 }
369
370 fn cardinality(&mut self) -> Arc<dyn Array> {
371 match self.bits_per_value {
372 128 => {
373 let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
374 let u128_slice = u128_slice_ref.as_ref();
375
376 const PRECISION: u8 = 4;
377 let mut hll: HyperLogLogPlus<u128, RandomState> =
378 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
379 for val in u128_slice {
380 hll.insert(val);
381 }
382 let cardinality = hll.count() as u64;
383 Arc::new(UInt64Array::from(vec![cardinality]))
384 }
385 _ => unreachable!(),
386 }
387 }
388}
389
390impl GetStat for OpaqueBlock {
391 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
392 let block_info = self.block_info.0.read().unwrap();
393
394 if block_info.is_empty() {
395 panic!("get_stat should be called after statistics are computed.");
396 }
397 block_info.get(&stat).cloned()
398 }
399}
400
401impl GetStat for DictionaryDataBlock {
402 fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
403 None
404 }
405}
406
407impl GetStat for StructDataBlock {
408 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
409 let block_info = self.block_info.0.read().unwrap();
410 if block_info.is_empty() {
411 panic!("get_stat should be called after statistics are computed.")
412 }
413 block_info.get(&stat).cloned()
414 }
415}
416
417impl ComputeStat for StructDataBlock {
418 fn compute_stat(&mut self) {
419 let data_size = self.data_size();
420 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
421
422 let max_len = self
423 .children
424 .iter()
425 .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
426 .sum::<u64>();
427 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
428
429 let mut info = self.block_info.0.write().unwrap();
430 info.insert(Stat::DataSize, data_size_array);
431 info.insert(Stat::MaxLength, max_len_array);
432 }
433}
434
435#[cfg(test)]
436mod tests {
437 use std::sync::Arc;
438
439 use arrow_array::{
440 ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
441 UInt16Array, UInt32Array, UInt64Array, UInt8Array,
442 };
443 use arrow_schema::{DataType, Field};
444 use lance_arrow::DataTypeExt;
445 use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
446 use rand::SeedableRng;
447
448 use crate::statistics::{GetStat, Stat};
449
450 use super::DataBlock;
451
452 use arrow::{
453 array::AsArray,
454 compute::concat,
455 datatypes::{Int32Type, UInt64Type},
456 };
457 use arrow_array::Array;
458 #[test]
459 fn test_data_size_stat() {
460 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
461 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
462 let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap();
463 let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap();
464 let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap();
465 let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
466
467 let concatenated_array = concat(&[
468 &*Arc::new(arr1.clone()) as &dyn Array,
469 &*Arc::new(arr2.clone()) as &dyn Array,
470 &*Arc::new(arr3.clone()) as &dyn Array,
471 ])
472 .unwrap();
473
474 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
475
476 let total_buffer_size: usize = concatenated_array
477 .to_data()
478 .buffers()
479 .iter()
480 .map(|buffer| buffer.len())
481 .sum();
482 assert!(data_size == total_buffer_size as u64);
483
484 let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
486 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
487 let block = DataBlock::from_array(arr.clone());
488 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
489
490 let total_buffer_size: usize = arr
491 .to_data()
492 .buffers()
493 .iter()
494 .map(|buffer| buffer.len())
495 .sum();
496 assert!(data_size == total_buffer_size as u64);
497
498 let fields = vec![
500 Arc::new(Field::new("int_field", DataType::Int32, false)),
501 Arc::new(Field::new("float_field", DataType::Float32, false)),
502 ]
503 .into();
504
505 let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields));
506 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
507 let block = DataBlock::from_array(arr.clone());
508 let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
509 let total_buffer_size: usize = arr_parts
510 .iter()
511 .map(|arr| {
512 arr.to_data()
513 .buffers()
514 .iter()
515 .map(|buffer| buffer.len())
516 .sum::<usize>()
517 })
518 .sum();
519 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
520 assert!(data_size == total_buffer_size as u64);
521
522 let mut gen = array::rand_type(&DataType::Dictionary(
524 Box::new(DataType::Int32),
525 Box::new(DataType::Utf8),
526 ));
527 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
528 let block = DataBlock::from_array(arr.clone());
529 assert!(block.get_stat(Stat::DataSize).is_none());
530
531 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
532 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
533 let block = DataBlock::from_array(arr.clone());
534 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
535 let total_buffer_size: usize = arr
536 .to_data()
537 .buffers()
538 .iter()
539 .map(|buffer| buffer.len())
540 .sum();
541
542 assert!(data_size == total_buffer_size as u64);
543 }
544
545 #[test]
546 fn test_bit_width_stat_for_integers() {
547 let int8_array = Int8Array::from(vec![1, 2, 3]);
548 let array_ref: ArrayRef = Arc::new(int8_array);
549 let block = DataBlock::from_array(array_ref);
550
551 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
552 let actual_bit_width = block.expect_stat(Stat::BitWidth);
553
554 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
555
556 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
557 let array_ref: ArrayRef = Arc::new(int8_array);
558 let block = DataBlock::from_array(array_ref);
559
560 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
561 let actual_bit_width = block.expect_stat(Stat::BitWidth);
562 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
563
564 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
565 let array_ref: ArrayRef = Arc::new(int8_array);
566 let block = DataBlock::from_array(array_ref);
567
568 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
569 let actual_bit_width = block.expect_stat(Stat::BitWidth);
570 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
571
572 let int8_array = Int8Array::from(vec![-1, 2, 3]);
573 let array_ref: ArrayRef = Arc::new(int8_array);
574 let block = DataBlock::from_array(array_ref);
575
576 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
577 let actual_bit_width = block.expect_stat(Stat::BitWidth);
578 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
579
580 let int16_array = Int16Array::from(vec![1, 2, 3]);
581 let array_ref: ArrayRef = Arc::new(int16_array);
582 let block = DataBlock::from_array(array_ref);
583
584 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
585 let actual_bit_width = block.expect_stat(Stat::BitWidth);
586 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
587
588 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
589 let array_ref: ArrayRef = Arc::new(int16_array);
590 let block = DataBlock::from_array(array_ref);
591
592 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
593 let actual_bit_width = block.expect_stat(Stat::BitWidth);
594 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
595
596 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
597 let array_ref: ArrayRef = Arc::new(int16_array);
598 let block = DataBlock::from_array(array_ref);
599
600 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
601 let actual_bit_width = block.expect_stat(Stat::BitWidth);
602 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
603
604 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
605 let array_ref: ArrayRef = Arc::new(int16_array);
606 let block = DataBlock::from_array(array_ref);
607
608 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
609 let actual_bit_width = block.expect_stat(Stat::BitWidth);
610 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
611
612 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
613 let array_ref: ArrayRef = Arc::new(int16_array);
614 let block = DataBlock::from_array(array_ref);
615
616 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
617 let actual_bit_width = block.expect_stat(Stat::BitWidth);
618 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
619
620 let int16_array = Int16Array::from(vec![-1, 2, 3]);
621 let array_ref: ArrayRef = Arc::new(int16_array);
622 let block = DataBlock::from_array(array_ref);
623
624 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
625 let actual_bit_width = block.expect_stat(Stat::BitWidth);
626 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
627
628 let int32_array = Int32Array::from(vec![1, 2, 3]);
629 let array_ref: ArrayRef = Arc::new(int32_array);
630 let block = DataBlock::from_array(array_ref);
631
632 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
633 let actual_bit_width = block.expect_stat(Stat::BitWidth);
634 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
635
636 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
637 let array_ref: ArrayRef = Arc::new(int32_array);
638 let block = DataBlock::from_array(array_ref);
639
640 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
641 let actual_bit_width = block.expect_stat(Stat::BitWidth);
642 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
643
644 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
645 let array_ref: ArrayRef = Arc::new(int32_array);
646 let block = DataBlock::from_array(array_ref);
647
648 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
649 let actual_bit_width = block.expect_stat(Stat::BitWidth);
650 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
651
652 let int32_array = Int32Array::from(vec![-1, 2, 3]);
653 let array_ref: ArrayRef = Arc::new(int32_array);
654 let block = DataBlock::from_array(array_ref);
655
656 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
657 let actual_bit_width = block.expect_stat(Stat::BitWidth);
658 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
659
660 let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
661 let array_ref: ArrayRef = Arc::new(int32_array);
662 let block = DataBlock::from_array(array_ref);
663
664 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
665 let actual_bit_width = block.expect_stat(Stat::BitWidth);
666 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
667
668 let int64_array = Int64Array::from(vec![1, 2, 3]);
669 let array_ref: ArrayRef = Arc::new(int64_array);
670 let block = DataBlock::from_array(array_ref);
671
672 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
673 let actual_bit_width = block.expect_stat(Stat::BitWidth);
674 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
675
676 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
677 let array_ref: ArrayRef = Arc::new(int64_array);
678 let block = DataBlock::from_array(array_ref);
679
680 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
681 let actual_bit_width = block.expect_stat(Stat::BitWidth);
682 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
683
684 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
685 let array_ref: ArrayRef = Arc::new(int64_array);
686 let block = DataBlock::from_array(array_ref);
687
688 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
689 let actual_bit_width = block.expect_stat(Stat::BitWidth);
690 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
691
692 let int64_array = Int64Array::from(vec![-1, 2, 3]);
693 let array_ref: ArrayRef = Arc::new(int64_array);
694 let block = DataBlock::from_array(array_ref);
695
696 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
697 let actual_bit_width = block.expect_stat(Stat::BitWidth);
698 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
699
700 let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
701 let array_ref: ArrayRef = Arc::new(int64_array);
702 let block = DataBlock::from_array(array_ref);
703
704 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
705 let actual_bit_width = block.expect_stat(Stat::BitWidth);
706 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
707
708 let uint8_array = UInt8Array::from(vec![1, 2, 3]);
709 let array_ref: ArrayRef = Arc::new(uint8_array);
710 let block = DataBlock::from_array(array_ref);
711
712 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
713 let actual_bit_width = block.expect_stat(Stat::BitWidth);
714 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
715
716 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
717 let array_ref: ArrayRef = Arc::new(uint8_array);
718 let block = DataBlock::from_array(array_ref);
719
720 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
721 let actual_bit_width = block.expect_stat(Stat::BitWidth);
722 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
723
724 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
725 let array_ref: ArrayRef = Arc::new(uint8_array);
726 let block = DataBlock::from_array(array_ref);
727
728 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
729 let actual_bit_width = block.expect_stat(Stat::BitWidth);
730 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
731
732 let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
733 let array_ref: ArrayRef = Arc::new(uint8_array);
734 let block = DataBlock::from_array(array_ref);
735
736 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
737 let actual_bit_width = block.expect_stat(Stat::BitWidth);
738 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
739
740 let uint16_array = UInt16Array::from(vec![1, 2, 3]);
741 let array_ref: ArrayRef = Arc::new(uint16_array);
742 let block = DataBlock::from_array(array_ref);
743
744 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
745 let actual_bit_width = block.expect_stat(Stat::BitWidth);
746 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
747
748 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
749 let array_ref: ArrayRef = Arc::new(uint16_array);
750 let block = DataBlock::from_array(array_ref);
751
752 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
753 let actual_bit_width = block.expect_stat(Stat::BitWidth);
754 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
755
756 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
757 let array_ref: ArrayRef = Arc::new(uint16_array);
758 let block = DataBlock::from_array(array_ref);
759
760 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
761 let actual_bit_width = block.expect_stat(Stat::BitWidth);
762 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
763
764 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
765 let array_ref: ArrayRef = Arc::new(uint16_array);
766 let block = DataBlock::from_array(array_ref);
767
768 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
769 let actual_bit_width = block.expect_stat(Stat::BitWidth);
770 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
771
772 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
773 let array_ref: ArrayRef = Arc::new(uint16_array);
774 let block = DataBlock::from_array(array_ref);
775
776 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
777 let actual_bit_width = block.expect_stat(Stat::BitWidth);
778 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
779
780 let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
781 let array_ref: ArrayRef = Arc::new(uint16_array);
782 let block = DataBlock::from_array(array_ref);
783
784 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
785 let actual_bit_width = block.expect_stat(Stat::BitWidth);
786 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
787
788 let uint32_array = UInt32Array::from(vec![1, 2, 3]);
789 let array_ref: ArrayRef = Arc::new(uint32_array);
790 let block = DataBlock::from_array(array_ref);
791
792 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
793 let actual_bit_width = block.expect_stat(Stat::BitWidth);
794 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
795
796 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
797 let array_ref: ArrayRef = Arc::new(uint32_array);
798 let block = DataBlock::from_array(array_ref);
799
800 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
801 let actual_bit_width = block.expect_stat(Stat::BitWidth);
802 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
803
804 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
805 let array_ref: ArrayRef = Arc::new(uint32_array);
806 let block = DataBlock::from_array(array_ref);
807
808 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
809 let actual_bit_width = block.expect_stat(Stat::BitWidth);
810 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
811
812 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
813 let array_ref: ArrayRef = Arc::new(uint32_array);
814 let block = DataBlock::from_array(array_ref);
815
816 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
817 let actual_bit_width = block.expect_stat(Stat::BitWidth);
818 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
819
820 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
821 let array_ref: ArrayRef = Arc::new(uint32_array);
822 let block = DataBlock::from_array(array_ref);
823
824 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
825 let actual_bit_width = block.expect_stat(Stat::BitWidth);
826 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
827
828 let uint64_array = UInt64Array::from(vec![1, 2, 3]);
829 let array_ref: ArrayRef = Arc::new(uint64_array);
830 let block = DataBlock::from_array(array_ref);
831
832 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
833 let actual_bit_width = block.expect_stat(Stat::BitWidth);
834 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
835
836 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
837 let array_ref: ArrayRef = Arc::new(uint64_array);
838 let block = DataBlock::from_array(array_ref);
839
840 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
841 let actual_bit_width = block.expect_stat(Stat::BitWidth);
842 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
843
844 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
845 let array_ref: ArrayRef = Arc::new(uint64_array);
846 let block = DataBlock::from_array(array_ref);
847
848 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
849 let actual_bit_width = block.expect_stat(Stat::BitWidth);
850 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
851
852 let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
853 let array_ref: ArrayRef = Arc::new(uint64_array);
854 let block = DataBlock::from_array(array_ref);
855
856 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
857 let actual_bit_width = block.expect_stat(Stat::BitWidth);
858 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
859
860 let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
861 let array_ref: ArrayRef = Arc::new(uint64_array);
862 let block = DataBlock::from_array(array_ref);
863
864 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
865 let actual_bit_width = block.expect_stat(Stat::BitWidth);
866 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
867 }
868
/// Checks the `BitWidth` stat of signed integer blocks holding more than 1024 values.
///
/// The input is three runs (1024 threes, 1024 eights, 10 minus-ones) and the stat is
/// expected to hold three entries, one per run.
/// NOTE(review): this presumably reflects the stat being computed per 1024-value
/// chunk; confirm against the `FixedWidthDataBlock` implementation.
#[test]
fn test_bit_width_stat_more_than_1024() {
    let signed_types = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ];

    for data_type in signed_types {
        // Build every run as Int64 first, then cast it down to the type under test.
        let runs = [
            Int64Array::from(vec![3; 1024]),
            Int64Array::from(vec![8; 1024]),
            Int64Array::from(vec![-1; 10]),
        ]
        .map(|run| arrow_cast::cast(&run, &data_type).unwrap());

        let run_refs: Vec<&dyn arrow::array::Array> =
            runs.iter().map(|run| run.as_ref()).collect();
        let block = DataBlock::from_array(concat(&run_refs).unwrap());

        // 3 fits in 2 bits, 8 fits in 4 bits, and the run of -1 is expected to
        // report the full width of the type.
        let full_width = (data_type.byte_width() * 8) as u64;
        let expected_bit_widths = Arc::new(UInt64Array::from(vec![2, 4, full_width])) as ArrayRef;

        let actual_bit_widths = block.expect_stat(Stat::BitWidth);
        assert_eq!(actual_bit_widths.as_ref(), expected_bit_widths.as_ref());
    }
}
898
/// A block built from a `Binary` array is expected to carry no `BitWidth` stat.
#[test]
fn test_bit_width_when_none() {
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
    let mut generator = lance_datagen::array::rand_type(&DataType::Binary);

    // Three randomly generated binary values are enough to build the block.
    let binary_values = generator.generate(RowCount::from(3), &mut rng).unwrap();
    let block = DataBlock::from_array(binary_values);

    let bit_width = block.get_stat(Stat::BitWidth);
    assert!(bit_width.is_none());
}
907
/// Checks the `Cardinality` stat of blocks built from `StringArray` and
/// `LargeStringArray` inputs, including an input that contains a duplicate.
#[test]
fn test_cardinality_variable_width_datablock() {
    // (values, expected number of distinct values)
    let cases: [(Vec<Option<&str>>, u64); 3] = [
        (vec![Some("hello"), Some("world")], 2),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"),
                Some("to be returned as values of procedures"),
            ],
            3,
        ),
        (
            // The first and last entries are equal, so only two values are distinct.
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"),
                Some("Samuel Eilenberg"),
            ],
            2,
        ),
    ];

    // First pass: 32-bit offset strings.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, *expected_cardinality);
    }

    // Second pass: the same inputs as 64-bit offset strings.
    for (values, expected_cardinality) in cases {
        let block = DataBlock::from_array(LargeStringArray::from(values));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, expected_cardinality);
    }
}
963
/// Checks the `MaxLength` stat of blocks built from `StringArray` and
/// `LargeStringArray` inputs against the length of a known longest entry.
#[test]
fn test_max_length_variable_width_datablock() {
    // Builds a block from `$array` and asserts that its `MaxLength` stat equals the
    // length of the entry at index `$longest`. The expected length is read before
    // the array is moved into the block, so no clone is needed.
    macro_rules! assert_max_length {
        ($array:expr, $longest:expr) => {{
            let string_array = $array;
            let expected_max_length = string_array.value_length($longest) as u64;
            let block = DataBlock::from_array(string_array);
            let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
            assert_eq!(actual_max_length, expected_max_length);
        }};
    }

    // Both entries have the same length, so either index names a longest entry.
    assert_max_length!(StringArray::from(vec![Some("hello"), Some("world")]), 0);

    assert_max_length!(
        StringArray::from(vec![
            Some("to be named by variables"),
            Some("to be passed as arguments to procedures"), // longest entry
            Some("to be returned as values of procedures"),
        ]),
        1
    );

    assert_max_length!(
        StringArray::from(vec![
            Some("Samuel Eilenberg"),
            Some("Saunders Mac Lane"), // longest entry
            Some("Samuel Eilenberg"),
        ]),
        1
    );

    assert_max_length!(
        LargeStringArray::from(vec![Some("hello"), Some("world")]),
        1
    );

    assert_max_length!(
        LargeStringArray::from(vec![
            Some("to be named by variables"),
            Some("to be passed as arguments to procedures"), // longest entry
            Some("to be returned as values of procedures"),
        ]),
        1
    );
}
1009}