1use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
21use arrow::array::*;
22#[cfg(not(feature = "force_hash_collisions"))]
23use arrow::compute::take;
24use arrow::datatypes::*;
25#[cfg(not(feature = "force_hash_collisions"))]
26use arrow::{downcast_dictionary_array, downcast_primitive_array};
27use foldhash::fast::FixedState;
28#[cfg(not(feature = "force_hash_collisions"))]
29use itertools::Itertools;
30#[cfg(not(feature = "force_hash_collisions"))]
31use std::collections::HashMap;
32use std::hash::{BuildHasher, Hash, Hasher};
33
34pub type RandomState = FixedState;
36
37#[cfg(not(feature = "force_hash_collisions"))]
38use crate::cast::{
39 as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
40 as_generic_binary_array, as_large_list_array, as_large_list_view_array,
41 as_list_array, as_list_view_array, as_map_array, as_string_array,
42 as_string_view_array, as_struct_array, as_union_array,
43};
44use crate::error::Result;
45use crate::error::{_internal_datafusion_err, _internal_err};
46use std::cell::RefCell;
47
48#[inline]
50pub fn combine_hashes(l: u64, r: u64) -> u64 {
51 let hash = (17 * 37u64).wrapping_add(l);
52 hash.wrapping_mul(37).wrapping_add(r)
53}
54
55const MAX_BUFFER_SIZE: usize = 524_288;
60
61thread_local! {
62 static HASH_BUFFER: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
67}
68
69pub fn with_hashes<I, T, F, R>(
101 arrays: I,
102 random_state: &RandomState,
103 callback: F,
104) -> Result<R>
105where
106 I: IntoIterator<Item = T>,
107 T: AsDynArray,
108 F: FnOnce(&[u64]) -> Result<R>,
109{
110 let mut iter = arrays.into_iter().peekable();
112
113 let required_size = match iter.peek() {
115 Some(arr) => arr.as_dyn_array().len(),
116 None => return _internal_err!("with_hashes requires at least one array"),
117 };
118
119 HASH_BUFFER.try_with(|cell| {
120 let mut buffer = cell.try_borrow_mut()
121 .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?;
122
123 buffer.clear();
125 buffer.resize(required_size, 0);
126
127 create_hashes(iter, random_state, &mut buffer[..required_size])?;
129
130 let result = callback(&buffer[..required_size])?;
132
133 if buffer.capacity() > MAX_BUFFER_SIZE {
135 buffer.truncate(MAX_BUFFER_SIZE);
136 buffer.shrink_to_fit();
137 }
138
139 Ok(result)
140 }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))?
141}
142
143#[cfg(not(feature = "force_hash_collisions"))]
144fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
145 if mul_col {
146 hashes_buffer.iter_mut().for_each(|hash| {
147 *hash = combine_hashes(random_state.hash_one(1), *hash);
149 })
150 } else {
151 hashes_buffer.iter_mut().for_each(|hash| {
152 *hash = random_state.hash_one(1);
153 })
154 }
155}
156
157pub trait HashValue {
158 fn hash_one(&self, state: &RandomState) -> u64;
159 fn hash_write(&self, hasher: &mut impl Hasher);
161}
162
163impl<T: HashValue + ?Sized> HashValue for &T {
164 fn hash_one(&self, state: &RandomState) -> u64 {
165 T::hash_one(self, state)
166 }
167 fn hash_write(&self, hasher: &mut impl Hasher) {
168 T::hash_write(self, hasher)
169 }
170}
171
172macro_rules! hash_value {
173 ($($t:ty),+) => {
174 $(impl HashValue for $t {
175 fn hash_one(&self, state: &RandomState) -> u64 {
176 state.hash_one(self)
177 }
178 fn hash_write(&self, hasher: &mut impl Hasher) {
179 Hash::hash(self, hasher)
180 }
181 })+
182 };
183}
184hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
185hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
186
187macro_rules! hash_float_value {
188 ($(($t:ty, $i:ty)),+) => {
189 $(impl HashValue for $t {
190 fn hash_one(&self, state: &RandomState) -> u64 {
191 state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
192 }
193 fn hash_write(&self, hasher: &mut impl Hasher) {
194 hasher.write(&self.to_ne_bytes())
195 }
196 })+
197 };
198}
199hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
200
201#[cfg(not(feature = "force_hash_collisions"))]
205#[inline]
206fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState {
207 foldhash::fast::SeedableRandomState::with_seed(
208 seed,
209 foldhash::SharedSeed::global_fixed(),
210 )
211}
212
213#[cfg(not(feature = "force_hash_collisions"))]
217fn hash_array_primitive<T>(
218 array: &PrimitiveArray<T>,
219 random_state: &RandomState,
220 hashes_buffer: &mut [u64],
221 rehash: bool,
222) where
223 T: ArrowPrimitiveType<Native: HashValue>,
224{
225 assert_eq!(
226 hashes_buffer.len(),
227 array.len(),
228 "hashes_buffer and array should be of equal length"
229 );
230
231 if array.null_count() == 0 {
232 if rehash {
233 for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
234 let mut hasher = seeded_state(*hash).build_hasher();
235 value.hash_write(&mut hasher);
236 *hash = hasher.finish();
237 }
238 } else {
239 for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
240 *hash = value.hash_one(random_state);
241 }
242 }
243 } else if rehash {
244 for i in array.nulls().unwrap().valid_indices() {
245 let value = unsafe { array.value_unchecked(i) };
246 let mut hasher = seeded_state(hashes_buffer[i]).build_hasher();
247 value.hash_write(&mut hasher);
248 hashes_buffer[i] = hasher.finish();
249 }
250 } else {
251 for i in array.nulls().unwrap().valid_indices() {
252 let value = unsafe { array.value_unchecked(i) };
253 hashes_buffer[i] = value.hash_one(random_state);
254 }
255 }
256}
257
258#[cfg(not(feature = "force_hash_collisions"))]
262fn hash_array<T>(
263 array: &T,
264 random_state: &RandomState,
265 hashes_buffer: &mut [u64],
266 rehash: bool,
267) where
268 T: ArrayAccessor,
269 T::Item: HashValue,
270{
271 assert_eq!(
272 hashes_buffer.len(),
273 array.len(),
274 "hashes_buffer and array should be of equal length"
275 );
276
277 if array.null_count() == 0 {
278 if rehash {
279 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
280 let value = unsafe { array.value_unchecked(i) };
281 *hash = combine_hashes(value.hash_one(random_state), *hash);
282 }
283 } else {
284 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
285 let value = unsafe { array.value_unchecked(i) };
286 *hash = value.hash_one(random_state);
287 }
288 }
289 } else if rehash {
290 for i in array.nulls().unwrap().valid_indices() {
291 let value = unsafe { array.value_unchecked(i) };
292 hashes_buffer[i] =
293 combine_hashes(value.hash_one(random_state), hashes_buffer[i]);
294 }
295 } else {
296 for i in array.nulls().unwrap().valid_indices() {
297 let value = unsafe { array.value_unchecked(i) };
298 hashes_buffer[i] = value.hash_one(random_state);
299 }
300 }
301}
302
303#[cfg(not(feature = "force_hash_collisions"))]
311#[inline(never)]
312fn hash_string_view_array_inner<
313 T: ByteViewType,
314 const HAS_NULLS: bool,
315 const HAS_BUFFERS: bool,
316 const REHASH: bool,
317>(
318 array: &GenericByteViewArray<T>,
319 random_state: &RandomState,
320 hashes_buffer: &mut [u64],
321) {
322 assert_eq!(
323 hashes_buffer.len(),
324 array.len(),
325 "hashes_buffer and array should be of equal length"
326 );
327
328 let buffers = array.data_buffers();
329 let view_bytes = |view_len: u32, view: u128| {
330 let view = ByteView::from(view);
331 let offset = view.offset as usize;
332 unsafe {
334 let data = buffers.get_unchecked(view.buffer_index as usize);
335 data.get_unchecked(offset..offset + view_len as usize)
336 }
337 };
338
339 let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter());
340 for (i, (hash, &v)) in hashes_and_views.enumerate() {
341 if HAS_NULLS && array.is_null(i) {
342 continue;
343 }
344 let view_len = v as u32;
345 if !HAS_BUFFERS || view_len <= 12 {
347 if REHASH {
348 let mut hasher = seeded_state(*hash).build_hasher();
349 v.hash_write(&mut hasher);
350 *hash = hasher.finish();
351 } else {
352 *hash = v.hash_one(random_state);
353 }
354 continue;
355 }
356 let value = view_bytes(view_len, v);
358 if REHASH {
359 let mut hasher = seeded_state(*hash).build_hasher();
360 value.hash_write(&mut hasher);
361 *hash = hasher.finish();
362 } else {
363 *hash = value.hash_one(random_state);
364 }
365 }
366}
367
368#[cfg(not(feature = "force_hash_collisions"))]
372fn hash_generic_byte_view_array<T: ByteViewType>(
373 array: &GenericByteViewArray<T>,
374 random_state: &RandomState,
375 hashes_buffer: &mut [u64],
376 rehash: bool,
377) {
378 match (
380 array.null_count() != 0,
381 !array.data_buffers().is_empty(),
382 rehash,
383 ) {
384 (false, false, false) => {
387 for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
388 *hash = view.hash_one(random_state);
389 }
390 }
391 (false, false, true) => {
392 for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
393 let mut hasher = seeded_state(*hash).build_hasher();
394 view.hash_write(&mut hasher);
395 *hash = hasher.finish();
396 }
397 }
398 (false, true, false) => hash_string_view_array_inner::<T, false, true, false>(
399 array,
400 random_state,
401 hashes_buffer,
402 ),
403 (false, true, true) => hash_string_view_array_inner::<T, false, true, true>(
404 array,
405 random_state,
406 hashes_buffer,
407 ),
408 (true, false, false) => hash_string_view_array_inner::<T, true, false, false>(
409 array,
410 random_state,
411 hashes_buffer,
412 ),
413 (true, false, true) => hash_string_view_array_inner::<T, true, false, true>(
414 array,
415 random_state,
416 hashes_buffer,
417 ),
418 (true, true, false) => hash_string_view_array_inner::<T, true, true, false>(
419 array,
420 random_state,
421 hashes_buffer,
422 ),
423 (true, true, true) => hash_string_view_array_inner::<T, true, true, true>(
424 array,
425 random_state,
426 hashes_buffer,
427 ),
428 }
429}
430
431#[cfg(not(feature = "force_hash_collisions"))]
438#[inline(never)]
439fn hash_dictionary_inner<
440 K: ArrowDictionaryKeyType,
441 const HAS_NULL_KEYS: bool,
442 const HAS_NULL_VALUES: bool,
443 const MULTI_COL: bool,
444>(
445 array: &DictionaryArray<K>,
446 random_state: &RandomState,
447 hashes_buffer: &mut [u64],
448) -> Result<()> {
449 let dict_values = array.values();
453 let mut dict_hashes = vec![0; dict_values.len()];
454 create_hashes([dict_values], random_state, &mut dict_hashes)?;
455
456 if HAS_NULL_KEYS {
457 for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
458 if let Some(key) = key {
459 let idx = key.as_usize();
460 if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
461 if MULTI_COL {
462 *hash = combine_hashes(dict_hashes[idx], *hash);
463 } else {
464 *hash = dict_hashes[idx];
465 }
466 }
467 }
468 }
469 } else {
470 for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().values()) {
471 let idx = key.as_usize();
472 if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
473 if MULTI_COL {
474 *hash = combine_hashes(dict_hashes[idx], *hash);
475 } else {
476 *hash = dict_hashes[idx];
477 }
478 }
479 }
480 }
481 Ok(())
482}
483
484#[cfg(not(feature = "force_hash_collisions"))]
486fn hash_dictionary<K: ArrowDictionaryKeyType>(
487 array: &DictionaryArray<K>,
488 random_state: &RandomState,
489 hashes_buffer: &mut [u64],
490 multi_col: bool,
491) -> Result<()> {
492 let has_null_keys = array.keys().null_count() != 0;
493 let has_null_values = array.values().null_count() != 0;
494
495 match (has_null_keys, has_null_values, multi_col) {
498 (false, false, false) => hash_dictionary_inner::<K, false, false, false>(
499 array,
500 random_state,
501 hashes_buffer,
502 ),
503 (false, false, true) => hash_dictionary_inner::<K, false, false, true>(
504 array,
505 random_state,
506 hashes_buffer,
507 ),
508 (false, true, false) => hash_dictionary_inner::<K, false, true, false>(
509 array,
510 random_state,
511 hashes_buffer,
512 ),
513 (false, true, true) => hash_dictionary_inner::<K, false, true, true>(
514 array,
515 random_state,
516 hashes_buffer,
517 ),
518 (true, false, false) => hash_dictionary_inner::<K, true, false, false>(
519 array,
520 random_state,
521 hashes_buffer,
522 ),
523 (true, false, true) => hash_dictionary_inner::<K, true, false, true>(
524 array,
525 random_state,
526 hashes_buffer,
527 ),
528 (true, true, false) => hash_dictionary_inner::<K, true, true, false>(
529 array,
530 random_state,
531 hashes_buffer,
532 ),
533 (true, true, true) => hash_dictionary_inner::<K, true, true, true>(
534 array,
535 random_state,
536 hashes_buffer,
537 ),
538 }
539}
540
541#[cfg(not(feature = "force_hash_collisions"))]
542fn hash_struct_array(
543 array: &StructArray,
544 random_state: &RandomState,
545 hashes_buffer: &mut [u64],
546) -> Result<()> {
547 let nulls = array.nulls();
548 let row_len = array.len();
549
550 let mut values_hashes = vec![0u64; row_len];
552 create_hashes(array.columns(), random_state, &mut values_hashes)?;
553
554 if let Some(nulls) = nulls {
556 for i in nulls.valid_indices() {
557 let hash = &mut hashes_buffer[i];
558 *hash = combine_hashes(*hash, values_hashes[i]);
559 }
560 } else {
561 for i in 0..row_len {
562 let hash = &mut hashes_buffer[i];
563 *hash = combine_hashes(*hash, values_hashes[i]);
564 }
565 }
566
567 Ok(())
568}
569
570#[cfg(not(feature = "force_hash_collisions"))]
572fn hash_map_array(
573 array: &MapArray,
574 random_state: &RandomState,
575 hashes_buffer: &mut [u64],
576) -> Result<()> {
577 let nulls = array.nulls();
578 let offsets = array.offsets();
579
580 let first_offset = offsets.first().copied().unwrap_or_default() as usize;
582 let last_offset = offsets.last().copied().unwrap_or_default() as usize;
583 let entries_len = last_offset - first_offset;
584
585 let mut values_hashes = vec![0u64; entries_len];
587 let entries = array.entries();
588 let sliced_columns: Vec<ArrayRef> = entries
589 .columns()
590 .iter()
591 .map(|col| col.slice(first_offset, entries_len))
592 .collect();
593 create_hashes(&sliced_columns, random_state, &mut values_hashes)?;
594
595 if let Some(nulls) = nulls {
598 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
599 if nulls.is_valid(i) {
600 let hash = &mut hashes_buffer[i];
601 for values_hash in &values_hashes
602 [start.as_usize() - first_offset..stop.as_usize() - first_offset]
603 {
604 *hash = combine_hashes(*hash, *values_hash);
605 }
606 }
607 }
608 } else {
609 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
610 let hash = &mut hashes_buffer[i];
611 for values_hash in &values_hashes
612 [start.as_usize() - first_offset..stop.as_usize() - first_offset]
613 {
614 *hash = combine_hashes(*hash, *values_hash);
615 }
616 }
617 }
618
619 Ok(())
620}
621
622#[cfg(not(feature = "force_hash_collisions"))]
623fn hash_list_array<OffsetSize>(
624 array: &GenericListArray<OffsetSize>,
625 random_state: &RandomState,
626 hashes_buffer: &mut [u64],
627) -> Result<()>
628where
629 OffsetSize: OffsetSizeTrait,
630{
631 let first_offset = array.value_offsets().first().cloned().unwrap_or_default();
633 let last_offset = array.value_offsets().last().cloned().unwrap_or_default();
634 let value_bytes_len = (last_offset - first_offset).as_usize();
635 let mut values_hashes = vec![0u64; value_bytes_len];
636 create_hashes(
637 [array
638 .values()
639 .slice(first_offset.as_usize(), value_bytes_len)],
640 random_state,
641 &mut values_hashes,
642 )?;
643
644 if array.null_count() > 0 {
645 for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate()
646 {
647 if array.is_valid(i) {
648 let hash = &mut hashes_buffer[i];
649 for values_hash in &values_hashes[(*start - first_offset).as_usize()
650 ..(*stop - first_offset).as_usize()]
651 {
652 *hash = combine_hashes(*hash, *values_hash);
653 }
654 }
655 }
656 } else {
657 for ((start, stop), hash) in array
658 .value_offsets()
659 .iter()
660 .tuple_windows()
661 .zip(hashes_buffer.iter_mut())
662 {
663 for values_hash in &values_hashes
664 [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()]
665 {
666 *hash = combine_hashes(*hash, *values_hash);
667 }
668 }
669 }
670 Ok(())
671}
672
673#[cfg(not(feature = "force_hash_collisions"))]
674fn hash_list_view_array<OffsetSize>(
675 array: &GenericListViewArray<OffsetSize>,
676 random_state: &RandomState,
677 hashes_buffer: &mut [u64],
678) -> Result<()>
679where
680 OffsetSize: OffsetSizeTrait,
681{
682 let values = array.values();
683 let offsets = array.value_offsets();
684 let sizes = array.value_sizes();
685 let nulls = array.nulls();
686 let mut values_hashes = vec![0u64; values.len()];
687 create_hashes([values], random_state, &mut values_hashes)?;
688 if let Some(nulls) = nulls {
689 for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
690 if nulls.is_valid(i) {
691 let hash = &mut hashes_buffer[i];
692 let start = offset.as_usize();
693 let end = start + size.as_usize();
694 for values_hash in &values_hashes[start..end] {
695 *hash = combine_hashes(*hash, *values_hash);
696 }
697 }
698 }
699 } else {
700 for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
701 let hash = &mut hashes_buffer[i];
702 let start = offset.as_usize();
703 let end = start + size.as_usize();
704 for values_hash in &values_hashes[start..end] {
705 *hash = combine_hashes(*hash, *values_hash);
706 }
707 }
708 }
709 Ok(())
710}
711
712#[cfg(not(feature = "force_hash_collisions"))]
713fn hash_union_array(
714 array: &UnionArray,
715 random_state: &RandomState,
716 hashes_buffer: &mut [u64],
717) -> Result<()> {
718 let DataType::Union(union_fields, _mode) = array.data_type() else {
719 unreachable!()
720 };
721
722 if array.is_dense() {
723 hash_union_array_default(array, union_fields, random_state, hashes_buffer)
726 } else {
727 hash_sparse_union_array(array, union_fields, random_state, hashes_buffer)
731 }
732}
733
734#[cfg(not(feature = "force_hash_collisions"))]
744fn hash_union_array_default(
745 array: &UnionArray,
746 union_fields: &UnionFields,
747 random_state: &RandomState,
748 hashes_buffer: &mut [u64],
749) -> Result<()> {
750 let mut child_hashes: HashMap<i8, Vec<u64>> =
751 HashMap::with_capacity(union_fields.len());
752
753 for (type_id, _field) in union_fields.iter() {
755 let child = array.child(type_id);
756 let mut child_hash_buffer = vec![0; child.len()];
757 create_hashes([child], random_state, &mut child_hash_buffer)?;
758
759 child_hashes.insert(type_id, child_hash_buffer);
760 }
761
762 #[expect(clippy::needless_range_loop)]
766 for i in 0..array.len() {
767 let type_id = array.type_id(i);
768 let child_offset = array.value_offset(i);
769
770 let child_hash = child_hashes.get(&type_id).expect("invalid type_id");
771 hashes_buffer[i] = combine_hashes(hashes_buffer[i], child_hash[child_offset]);
772 }
773
774 Ok(())
775}
776
777#[cfg(not(feature = "force_hash_collisions"))]
785fn hash_sparse_union_array(
786 array: &UnionArray,
787 union_fields: &UnionFields,
788 random_state: &RandomState,
789 hashes_buffer: &mut [u64],
790) -> Result<()> {
791 use std::collections::HashMap;
792
793 if union_fields.len() <= 2 {
796 return hash_union_array_default(
797 array,
798 union_fields,
799 random_state,
800 hashes_buffer,
801 );
802 }
803
804 let type_ids = array.type_ids();
805
806 let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
808 for (i, &type_id) in type_ids.iter().enumerate() {
809 indices_by_type.entry(type_id).or_default().push(i as u32);
810 }
811
812 for (type_id, _field) in union_fields.iter() {
814 if let Some(indices) = indices_by_type.get(&type_id) {
815 if indices.is_empty() {
816 continue;
817 }
818
819 let child = array.child(type_id);
820 let indices_array = UInt32Array::from(indices.clone());
821
822 let filtered = take(child.as_ref(), &indices_array, None)?;
824
825 let mut filtered_hashes = vec![0u64; filtered.len()];
827 create_hashes([&filtered], random_state, &mut filtered_hashes)?;
828
829 for (hash, &idx) in filtered_hashes.iter().zip(indices.iter()) {
831 hashes_buffer[idx as usize] =
832 combine_hashes(hashes_buffer[idx as usize], *hash);
833 }
834 }
835 }
836
837 Ok(())
838}
839
840#[cfg(not(feature = "force_hash_collisions"))]
841fn hash_fixed_list_array(
842 array: &FixedSizeListArray,
843 random_state: &RandomState,
844 hashes_buffer: &mut [u64],
845) -> Result<()> {
846 let values = array.values();
847 let value_length = array.value_length() as usize;
848 let nulls = array.nulls();
849 let mut values_hashes = vec![0u64; values.len()];
850 create_hashes([values], random_state, &mut values_hashes)?;
851 if let Some(nulls) = nulls {
852 for i in 0..array.len() {
853 if nulls.is_valid(i) {
854 let hash = &mut hashes_buffer[i];
855 for values_hash in
856 &values_hashes[i * value_length..(i + 1) * value_length]
857 {
858 *hash = combine_hashes(*hash, *values_hash);
859 }
860 }
861 }
862 } else {
863 for i in 0..array.len() {
864 let hash = &mut hashes_buffer[i];
865 for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
866 *hash = combine_hashes(*hash, *values_hash);
867 }
868 }
869 }
870 Ok(())
871}
872
873#[inline(never)]
875#[cfg(not(feature = "force_hash_collisions"))]
876fn hash_run_array_inner<
877 R: RunEndIndexType,
878 const HAS_NULL_VALUES: bool,
879 const REHASH: bool,
880>(
881 array: &RunArray<R>,
882 random_state: &RandomState,
883 hashes_buffer: &mut [u64],
884) -> Result<()> {
885 let array_offset = array.offset();
889 let array_len = array.len();
890
891 if array_len == 0 {
892 return Ok(());
893 }
894
895 let run_ends = array.run_ends();
896 let run_ends_values = run_ends.values();
897 let values = array.values();
898
899 let start_physical_index = array.get_start_physical_index();
900 let end_physical_index = array.get_end_physical_index() + 1;
903
904 let sliced_values = values.slice(
905 start_physical_index,
906 end_physical_index - start_physical_index,
907 );
908 let mut values_hashes = vec![0u64; sliced_values.len()];
909 create_hashes(
910 std::slice::from_ref(&sliced_values),
911 random_state,
912 &mut values_hashes,
913 )?;
914
915 let mut start_in_slice = 0;
916 for (adjusted_physical_index, &absolute_run_end) in run_ends_values
917 [start_physical_index..end_physical_index]
918 .iter()
919 .enumerate()
920 {
921 let absolute_run_end = absolute_run_end.as_usize();
922 let end_in_slice = (absolute_run_end - array_offset).min(array_len);
923
924 if HAS_NULL_VALUES && sliced_values.is_null(adjusted_physical_index) {
925 start_in_slice = end_in_slice;
926 continue;
927 }
928
929 let value_hash = values_hashes[adjusted_physical_index];
930 let run_slice = &mut hashes_buffer[start_in_slice..end_in_slice];
931
932 if REHASH {
933 for hash in run_slice.iter_mut() {
934 *hash = combine_hashes(value_hash, *hash);
935 }
936 } else {
937 run_slice.fill(value_hash);
938 }
939
940 start_in_slice = end_in_slice;
941 }
942
943 Ok(())
944}
945
946#[cfg(not(feature = "force_hash_collisions"))]
947fn hash_run_array<R: RunEndIndexType>(
948 array: &RunArray<R>,
949 random_state: &RandomState,
950 hashes_buffer: &mut [u64],
951 rehash: bool,
952) -> Result<()> {
953 let has_null_values = array.values().null_count() != 0;
954
955 match (has_null_values, rehash) {
956 (false, false) => {
957 hash_run_array_inner::<R, false, false>(array, random_state, hashes_buffer)
958 }
959 (false, true) => {
960 hash_run_array_inner::<R, false, true>(array, random_state, hashes_buffer)
961 }
962 (true, false) => {
963 hash_run_array_inner::<R, true, false>(array, random_state, hashes_buffer)
964 }
965 (true, true) => {
966 hash_run_array_inner::<R, true, true>(array, random_state, hashes_buffer)
967 }
968 }
969}
970
971#[cfg(not(feature = "force_hash_collisions"))]
974fn hash_single_array(
975 array: &dyn Array,
976 random_state: &RandomState,
977 hashes_buffer: &mut [u64],
978 rehash: bool,
979) -> Result<()> {
980 downcast_primitive_array! {
981 array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
982 DataType::Null => hash_null(random_state, hashes_buffer, rehash),
983 DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
984 DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
985 DataType::Utf8View => hash_generic_byte_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
986 DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
987 DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
988 DataType::BinaryView => hash_generic_byte_view_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
989 DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
990 DataType::FixedSizeBinary(_) => {
991 let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
992 hash_array(&array, random_state, hashes_buffer, rehash)
993 }
994 DataType::Dictionary(_, _) => downcast_dictionary_array! {
995 array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
996 _ => unreachable!()
997 }
998 DataType::Struct(_) => {
999 let array = as_struct_array(array)?;
1000 hash_struct_array(array, random_state, hashes_buffer)?;
1001 }
1002 DataType::List(_) => {
1003 let array = as_list_array(array)?;
1004 hash_list_array(array, random_state, hashes_buffer)?;
1005 }
1006 DataType::LargeList(_) => {
1007 let array = as_large_list_array(array)?;
1008 hash_list_array(array, random_state, hashes_buffer)?;
1009 }
1010 DataType::ListView(_) => {
1011 let array = as_list_view_array(array)?;
1012 hash_list_view_array(array, random_state, hashes_buffer)?;
1013 }
1014 DataType::LargeListView(_) => {
1015 let array = as_large_list_view_array(array)?;
1016 hash_list_view_array(array, random_state, hashes_buffer)?;
1017 }
1018 DataType::Map(_, _) => {
1019 let array = as_map_array(array)?;
1020 hash_map_array(array, random_state, hashes_buffer)?;
1021 }
1022 DataType::FixedSizeList(_,_) => {
1023 let array = as_fixed_size_list_array(array)?;
1024 hash_fixed_list_array(array, random_state, hashes_buffer)?;
1025 }
1026 DataType::Union(_, _) => {
1027 let array = as_union_array(array)?;
1028 hash_union_array(array, random_state, hashes_buffer)?;
1029 }
1030 DataType::RunEndEncoded(_, _) => downcast_run_array! {
1031 array => hash_run_array(array, random_state, hashes_buffer, rehash)?,
1032 _ => unreachable!()
1033 }
1034 _ => {
1035 return _internal_err!(
1037 "Unsupported data type in hasher: {}",
1038 array.data_type()
1039 );
1040 }
1041 }
1042 Ok(())
1043}
1044
1045#[cfg(feature = "force_hash_collisions")]
1047fn hash_single_array(
1048 _array: &dyn Array,
1049 _random_state: &RandomState,
1050 hashes_buffer: &mut [u64],
1051 _rehash: bool,
1052) -> Result<()> {
1053 for hash in hashes_buffer.iter_mut() {
1054 *hash = 0
1055 }
1056 Ok(())
1057}
1058
1059pub trait AsDynArray {
1069 fn as_dyn_array(&self) -> &dyn Array;
1070}
1071
1072impl AsDynArray for dyn Array {
1073 fn as_dyn_array(&self) -> &dyn Array {
1074 self
1075 }
1076}
1077
1078impl AsDynArray for &dyn Array {
1079 fn as_dyn_array(&self) -> &dyn Array {
1080 *self
1081 }
1082}
1083
1084impl AsDynArray for ArrayRef {
1085 fn as_dyn_array(&self) -> &dyn Array {
1086 self.as_ref()
1087 }
1088}
1089
1090impl AsDynArray for &ArrayRef {
1091 fn as_dyn_array(&self) -> &dyn Array {
1092 self.as_ref()
1093 }
1094}
1095
1096pub fn create_hashes<'a, I, T>(
1101 arrays: I,
1102 random_state: &RandomState,
1103 hashes_buffer: &'a mut [u64],
1104) -> Result<&'a mut [u64]>
1105where
1106 I: IntoIterator<Item = T>,
1107 T: AsDynArray,
1108{
1109 for (i, array) in arrays.into_iter().enumerate() {
1110 let rehash = i >= 1;
1112 hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
1113 }
1114 Ok(hashes_buffer)
1115}
1116
1117#[cfg(test)]
1118mod tests {
1119 use std::sync::Arc;
1120
1121 use arrow::array::*;
1122 #[cfg(not(feature = "force_hash_collisions"))]
1123 use arrow::datatypes::*;
1124
1125 use super::*;
1126
1127 #[test]
1128 fn create_hashes_for_decimal_array() -> Result<()> {
1129 let array = vec![1, 2, 3, 4]
1130 .into_iter()
1131 .map(Some)
1132 .collect::<Decimal128Array>()
1133 .with_precision_and_scale(20, 3)
1134 .unwrap();
1135 let array_ref: ArrayRef = Arc::new(array);
1136 let random_state = RandomState::with_seed(0);
1137 let hashes_buff = &mut vec![0; array_ref.len()];
1138 let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
1139 assert_eq!(hashes.len(), 4);
1140 Ok(())
1141 }
1142
1143 #[test]
1144 fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
1145 let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
1146 let random_state = RandomState::with_seed(0);
1147 let hashes_buff = &mut [0; 0];
1148 let hashes = create_hashes(
1149 &[Arc::new(empty_array) as ArrayRef],
1150 &random_state,
1151 hashes_buff,
1152 )?;
1153 assert_eq!(hashes, &Vec::<u64>::new());
1154 Ok(())
1155 }
1156
1157 #[test]
1158 fn create_hashes_for_float_arrays() -> Result<()> {
1159 let f32_arr: ArrayRef =
1160 Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
1161 let f64_arr: ArrayRef =
1162 Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
1163
1164 let random_state = RandomState::with_seed(0);
1165 let hashes_buff = &mut vec![0; f32_arr.len()];
1166 let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
1167 assert_eq!(hashes.len(), 4,);
1168
1169 let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
1170 assert_eq!(hashes.len(), 4,);
1171
1172 Ok(())
1173 }
1174
1175 macro_rules! create_hash_binary {
1176 ($NAME:ident, $ARRAY:ty) => {
1177 #[cfg(not(feature = "force_hash_collisions"))]
1178 #[test]
1179 fn $NAME() {
1180 let binary = [
1181 Some(b"short".to_byte_slice()),
1182 None,
1183 Some(b"long but different 12 bytes string"),
1184 Some(b"short2"),
1185 Some(b"Longer than 12 bytes string"),
1186 Some(b"short"),
1187 Some(b"Longer than 12 bytes string"),
1188 ];
1189
1190 let binary_array: ArrayRef =
1191 Arc::new(binary.iter().cloned().collect::<$ARRAY>());
1192
1193 let random_state = RandomState::with_seed(0);
1194
1195 let mut binary_hashes = vec![0; binary.len()];
1196 create_hashes(&[binary_array], &random_state, &mut binary_hashes)
1197 .unwrap();
1198
1199 for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
1201 match val {
1202 Some(_) => assert_ne!(*hash, 0),
1203 None => assert_eq!(*hash, 0),
1204 }
1205 }
1206
1207 assert_eq!(binary[0], binary[5]);
1209 assert_eq!(binary[4], binary[6]);
1210
1211 assert_ne!(binary[0], binary[2]);
1213 }
1214 };
1215 }
1216
1217 create_hash_binary!(binary_array, BinaryArray);
1218 create_hash_binary!(large_binary_array, LargeBinaryArray);
1219 create_hash_binary!(binary_view_array, BinaryViewArray);
1220
1221 #[test]
1222 fn create_hashes_fixed_size_binary() -> Result<()> {
1223 let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
1224 let fixed_size_binary_array: ArrayRef =
1225 Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
1226
1227 let random_state = RandomState::with_seed(0);
1228 let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
1229 let hashes =
1230 create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
1231 assert_eq!(hashes.len(), 3,);
1232
1233 Ok(())
1234 }
1235
1236 macro_rules! create_hash_string {
1237 ($NAME:ident, $ARRAY:ty) => {
1238 #[cfg(not(feature = "force_hash_collisions"))]
1239 #[test]
1240 fn $NAME() {
1241 let strings = [
1242 Some("short"),
1243 None,
1244 Some("long but different 12 bytes string"),
1245 Some("short2"),
1246 Some("Longer than 12 bytes string"),
1247 Some("short"),
1248 Some("Longer than 12 bytes string"),
1249 ];
1250
1251 let string_array: ArrayRef =
1252 Arc::new(strings.iter().cloned().collect::<$ARRAY>());
1253 let dict_array: ArrayRef = Arc::new(
1254 strings
1255 .iter()
1256 .cloned()
1257 .collect::<DictionaryArray<Int8Type>>(),
1258 );
1259
1260 let random_state = RandomState::with_seed(0);
1261
1262 let mut string_hashes = vec![0; strings.len()];
1263 create_hashes(&[string_array], &random_state, &mut string_hashes)
1264 .unwrap();
1265
1266 let mut dict_hashes = vec![0; strings.len()];
1267 create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
1268
1269 for (val, hash) in strings.iter().zip(string_hashes.iter()) {
1271 match val {
1272 Some(_) => assert_ne!(*hash, 0),
1273 None => assert_eq!(*hash, 0),
1274 }
1275 }
1276
1277 assert_eq!(string_hashes, dict_hashes);
1279
1280 assert_eq!(strings[0], strings[5]);
1282 assert_eq!(strings[4], strings[6]);
1283
1284 assert_ne!(strings[0], strings[2]);
1286 }
1287 };
1288 }
1289
1290 create_hash_string!(string_array, StringArray);
1291 create_hash_string!(large_string_array, LargeStringArray);
1292 create_hash_string!(string_view_array, StringArray);
1293 create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
1294
1295 #[test]
1296 #[cfg(not(feature = "force_hash_collisions"))]
1297 fn create_hashes_for_run_array() -> Result<()> {
1298 let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
1299 let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
1300 let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
1301
1302 let random_state = RandomState::with_seed(0);
1303 let hashes_buff = &mut vec![0; array.len()];
1304 let hashes = create_hashes(
1305 &[Arc::clone(&array) as ArrayRef],
1306 &random_state,
1307 hashes_buff,
1308 )?;
1309
1310 assert_eq!(hashes.len(), 7);
1311 assert_eq!(hashes[0], hashes[1]);
1312 assert_eq!(hashes[2], hashes[3]);
1313 assert_eq!(hashes[3], hashes[4]);
1314 assert_eq!(hashes[5], hashes[6]);
1315 assert_ne!(hashes[0], hashes[2]);
1316 assert_ne!(hashes[2], hashes[5]);
1317 assert_ne!(hashes[0], hashes[5]);
1318
1319 Ok(())
1320 }
1321
1322 #[test]
1323 #[cfg(not(feature = "force_hash_collisions"))]
1324 fn create_multi_column_hash_with_run_array() -> Result<()> {
1325 let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]));
1326 let values = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
1327 let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
1328 let run_array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
1329
1330 let random_state = RandomState::with_seed(0);
1331 let mut one_col_hashes = vec![0; int_array.len()];
1332 create_hashes(
1333 &[Arc::clone(&int_array) as ArrayRef],
1334 &random_state,
1335 &mut one_col_hashes,
1336 )?;
1337
1338 let mut two_col_hashes = vec![0; int_array.len()];
1339 create_hashes(
1340 &[
1341 Arc::clone(&int_array) as ArrayRef,
1342 Arc::clone(&run_array) as ArrayRef,
1343 ],
1344 &random_state,
1345 &mut two_col_hashes,
1346 )?;
1347
1348 assert_eq!(one_col_hashes.len(), 7);
1349 assert_eq!(two_col_hashes.len(), 7);
1350 assert_ne!(one_col_hashes, two_col_hashes);
1351
1352 let diff_0_vs_1_one_col = one_col_hashes[0] != one_col_hashes[1];
1353 let diff_0_vs_1_two_col = two_col_hashes[0] != two_col_hashes[1];
1354 assert_eq!(diff_0_vs_1_one_col, diff_0_vs_1_two_col);
1355
1356 let diff_2_vs_3_one_col = one_col_hashes[2] != one_col_hashes[3];
1357 let diff_2_vs_3_two_col = two_col_hashes[2] != two_col_hashes[3];
1358 assert_eq!(diff_2_vs_3_one_col, diff_2_vs_3_two_col);
1359
1360 Ok(())
1361 }
1362
1363 #[test]
1364 #[cfg(not(feature = "force_hash_collisions"))]
1366 fn create_hashes_for_dict_arrays() {
1367 let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
1368
1369 let string_array: ArrayRef =
1370 Arc::new(strings.iter().cloned().collect::<StringArray>());
1371 let dict_array: ArrayRef = Arc::new(
1372 strings
1373 .iter()
1374 .cloned()
1375 .collect::<DictionaryArray<Int8Type>>(),
1376 );
1377
1378 let random_state = RandomState::with_seed(0);
1379
1380 let mut string_hashes = vec![0; strings.len()];
1381 create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
1382
1383 let mut dict_hashes = vec![0; strings.len()];
1384 create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
1385
1386 for (val, hash) in strings.iter().zip(string_hashes.iter()) {
1388 match val {
1389 Some(_) => assert_ne!(*hash, 0),
1390 None => assert_eq!(*hash, 0),
1391 }
1392 }
1393
1394 assert_eq!(string_hashes, dict_hashes);
1396
1397 assert_eq!(strings[1], strings[4]);
1399 assert_eq!(dict_hashes[1], dict_hashes[4]);
1400 assert_eq!(strings[0], strings[3]);
1401 assert_eq!(dict_hashes[0], dict_hashes[3]);
1402
1403 assert_ne!(strings[0], strings[2]);
1405 assert_ne!(dict_hashes[0], dict_hashes[2]);
1406 }
1407
1408 #[test]
1409 #[cfg(not(feature = "force_hash_collisions"))]
1411 fn create_hashes_for_list_arrays() {
1412 let data = vec![
1413 Some(vec![Some(0), Some(1), Some(2)]),
1414 None,
1415 Some(vec![Some(3), None, Some(5)]),
1416 Some(vec![Some(3), None, Some(5)]),
1417 None,
1418 Some(vec![Some(0), Some(1), Some(2)]),
1419 Some(vec![]),
1420 ];
1421 let list_array =
1422 Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
1423 let random_state = RandomState::with_seed(0);
1424 let mut hashes = vec![0; list_array.len()];
1425 create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1426 assert_eq!(hashes[0], hashes[5]);
1427 assert_eq!(hashes[1], hashes[4]);
1428 assert_eq!(hashes[2], hashes[3]);
1429 assert_eq!(hashes[1], hashes[6]); }
1431
1432 #[test]
1433 #[cfg(not(feature = "force_hash_collisions"))]
1434 fn create_hashes_for_sliced_list_arrays() {
1435 let data = vec![
1436 Some(vec![Some(0), Some(1), Some(2)]),
1437 None,
1438 Some(vec![Some(3), None, Some(5)]),
1440 Some(vec![Some(3), None, Some(5)]),
1441 None,
1442 Some(vec![Some(0), Some(1), Some(2)]),
1444 Some(vec![]),
1445 ];
1446 let list_array =
1447 Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
1448 let list_array = list_array.slice(2, 3);
1449 let random_state = RandomState::with_seed(0);
1450 let mut hashes = vec![0; list_array.len()];
1451 create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1452 assert_eq!(hashes[0], hashes[1]);
1453 assert_ne!(hashes[1], hashes[2]);
1454 }
1455
1456 #[test]
1457 #[cfg(not(feature = "force_hash_collisions"))]
1459 fn create_hashes_for_list_view_arrays() {
1460 use arrow::buffer::{NullBuffer, ScalarBuffer};
1461
1462 let values = Arc::new(Int32Array::from(vec![
1464 Some(0),
1465 Some(1),
1466 Some(2),
1467 Some(3),
1468 None,
1469 Some(5),
1470 ])) as ArrayRef;
1471 let field = Arc::new(Field::new("item", DataType::Int32, true));
1472
1473 let offsets = ScalarBuffer::from(vec![0i32, 0, 3, 3, 0, 0, 0]);
1482 let sizes = ScalarBuffer::from(vec![3i32, 0, 3, 3, 0, 3, 0]);
1483 let nulls = Some(NullBuffer::from(vec![
1484 true, false, true, true, false, true, true,
1485 ]));
1486
1487 let list_view_array =
1488 Arc::new(ListViewArray::new(field, offsets, sizes, values, nulls))
1489 as ArrayRef;
1490
1491 let random_state = RandomState::with_seed(0);
1492 let mut hashes = vec![0; list_view_array.len()];
1493 create_hashes(&[list_view_array], &random_state, &mut hashes).unwrap();
1494
1495 assert_eq!(hashes[0], hashes[5]); assert_eq!(hashes[1], hashes[4]); assert_eq!(hashes[2], hashes[3]); assert_eq!(hashes[1], hashes[6]); assert_ne!(hashes[0], hashes[2]); assert_ne!(hashes[0], hashes[6]); assert_ne!(hashes[2], hashes[6]); }
1505
1506 #[test]
1507 #[cfg(not(feature = "force_hash_collisions"))]
1509 fn create_hashes_for_large_list_view_arrays() {
1510 use arrow::buffer::{NullBuffer, ScalarBuffer};
1511
1512 let values = Arc::new(Int32Array::from(vec![
1514 Some(0),
1515 Some(1),
1516 Some(2),
1517 Some(3),
1518 None,
1519 Some(5),
1520 ])) as ArrayRef;
1521 let field = Arc::new(Field::new("item", DataType::Int32, true));
1522
1523 let offsets = ScalarBuffer::from(vec![0i64, 0, 3, 3, 0, 0, 0]);
1532 let sizes = ScalarBuffer::from(vec![3i64, 0, 3, 3, 0, 3, 0]);
1533 let nulls = Some(NullBuffer::from(vec![
1534 true, false, true, true, false, true, true,
1535 ]));
1536
1537 let large_list_view_array = Arc::new(LargeListViewArray::new(
1538 field, offsets, sizes, values, nulls,
1539 )) as ArrayRef;
1540
1541 let random_state = RandomState::with_seed(0);
1542 let mut hashes = vec![0; large_list_view_array.len()];
1543 create_hashes(&[large_list_view_array], &random_state, &mut hashes).unwrap();
1544
1545 assert_eq!(hashes[0], hashes[5]); assert_eq!(hashes[1], hashes[4]); assert_eq!(hashes[2], hashes[3]); assert_eq!(hashes[1], hashes[6]); assert_ne!(hashes[0], hashes[2]); assert_ne!(hashes[0], hashes[6]); assert_ne!(hashes[2], hashes[6]); }
1555
1556 #[test]
1557 #[cfg(not(feature = "force_hash_collisions"))]
1559 fn create_hashes_for_fixed_size_list_arrays() {
1560 let data = vec![
1561 Some(vec![Some(0), Some(1), Some(2)]),
1562 None,
1563 Some(vec![Some(3), None, Some(5)]),
1564 Some(vec![Some(3), None, Some(5)]),
1565 None,
1566 Some(vec![Some(0), Some(1), Some(2)]),
1567 ];
1568 let list_array =
1569 Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
1570 data, 3,
1571 )) as ArrayRef;
1572 let random_state = RandomState::with_seed(0);
1573 let mut hashes = vec![0; list_array.len()];
1574 create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1575 assert_eq!(hashes[0], hashes[5]);
1576 assert_eq!(hashes[1], hashes[4]);
1577 assert_eq!(hashes[2], hashes[3]);
1578 }
1579
1580 #[test]
1581 #[cfg(not(feature = "force_hash_collisions"))]
1583 fn create_hashes_for_struct_arrays() {
1584 use arrow::buffer::Buffer;
1585
1586 let boolarr = Arc::new(BooleanArray::from(vec![
1587 false, false, true, true, true, true,
1588 ]));
1589 let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
1590
1591 let struct_array = StructArray::from((
1592 vec![
1593 (
1594 Arc::new(Field::new("bool", DataType::Boolean, false)),
1595 Arc::clone(&boolarr) as ArrayRef,
1596 ),
1597 (
1598 Arc::new(Field::new("i32", DataType::Int32, false)),
1599 Arc::clone(&i32arr) as ArrayRef,
1600 ),
1601 (
1602 Arc::new(Field::new("i32", DataType::Int32, false)),
1603 Arc::clone(&i32arr) as ArrayRef,
1604 ),
1605 (
1606 Arc::new(Field::new("bool", DataType::Boolean, false)),
1607 Arc::clone(&boolarr) as ArrayRef,
1608 ),
1609 ],
1610 Buffer::from(&[0b001011]),
1611 ));
1612
1613 assert!(struct_array.is_valid(0));
1614 assert!(struct_array.is_valid(1));
1615 assert!(struct_array.is_null(2));
1616 assert!(struct_array.is_valid(3));
1617 assert!(struct_array.is_null(4));
1618 assert!(struct_array.is_null(5));
1619
1620 let array = Arc::new(struct_array) as ArrayRef;
1621
1622 let random_state = RandomState::with_seed(0);
1623 let mut hashes = vec![0; array.len()];
1624 create_hashes(&[array], &random_state, &mut hashes).unwrap();
1625 assert_eq!(hashes[0], hashes[1]);
1626 assert_ne!(hashes[2], hashes[3]);
1628 assert_eq!(hashes[4], hashes[5]);
1630 }
1631
1632 #[test]
1633 #[cfg(not(feature = "force_hash_collisions"))]
1635 fn create_hashes_for_struct_arrays_more_column_than_row() {
1636 let struct_array = StructArray::from(vec![
1637 (
1638 Arc::new(Field::new("bool", DataType::Boolean, false)),
1639 Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
1640 ),
1641 (
1642 Arc::new(Field::new("i32-1", DataType::Int32, false)),
1643 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1644 ),
1645 (
1646 Arc::new(Field::new("i32-2", DataType::Int32, false)),
1647 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1648 ),
1649 (
1650 Arc::new(Field::new("i32-3", DataType::Int32, false)),
1651 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1652 ),
1653 ]);
1654
1655 assert!(struct_array.is_valid(0));
1656 assert!(struct_array.is_valid(1));
1657
1658 let array = Arc::new(struct_array) as ArrayRef;
1659 let random_state = RandomState::with_seed(0);
1660 let mut hashes = vec![0; array.len()];
1661 create_hashes(&[array], &random_state, &mut hashes).unwrap();
1662 assert_eq!(hashes[0], hashes[1]);
1663 }
1664
1665 #[test]
1666 #[cfg(not(feature = "force_hash_collisions"))]
1668 fn create_hashes_for_map_arrays() {
1669 let mut builder =
1670 MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
1671 builder.keys().append_value("key1");
1673 builder.keys().append_value("key2");
1674 builder.values().append_value(1);
1675 builder.values().append_value(2);
1676 builder.append(true).unwrap();
1677 builder.keys().append_value("key1");
1679 builder.keys().append_value("key2");
1680 builder.values().append_value(1);
1681 builder.values().append_value(2);
1682 builder.append(true).unwrap();
1683 builder.keys().append_value("key1");
1685 builder.keys().append_value("key2");
1686 builder.values().append_value(1);
1687 builder.values().append_value(3);
1688 builder.append(true).unwrap();
1689 builder.keys().append_value("key1");
1691 builder.keys().append_value("key3");
1692 builder.values().append_value(1);
1693 builder.values().append_value(2);
1694 builder.append(true).unwrap();
1695 builder.keys().append_value("key1");
1697 builder.values().append_value(1);
1698 builder.append(true).unwrap();
1699 builder.keys().append_value("key1");
1701 builder.values().append_null();
1702 builder.append(true).unwrap();
1703 builder.append(true).unwrap();
1705 builder.keys().append_value("key1");
1707 builder.values().append_value(1);
1708 builder.append(false).unwrap();
1709
1710 let array = Arc::new(builder.finish()) as ArrayRef;
1711
1712 let random_state = RandomState::with_seed(0);
1713 let mut hashes = vec![0; array.len()];
1714 create_hashes(&[array], &random_state, &mut hashes).unwrap();
1715 assert_eq!(hashes[0], hashes[1]); assert_ne!(hashes[0], hashes[2]); assert_ne!(hashes[0], hashes[3]); assert_ne!(hashes[0], hashes[4]); assert_ne!(hashes[4], hashes[5]); assert_eq!(hashes[6], hashes[7]); }
1722
1723 #[test]
1724 #[cfg(not(feature = "force_hash_collisions"))]
1726 fn create_multi_column_hash_for_dict_arrays() {
1727 let strings1 = [Some("foo"), None, Some("bar")];
1728 let strings2 = [Some("blarg"), Some("blah"), None];
1729
1730 let string_array: ArrayRef =
1731 Arc::new(strings1.iter().cloned().collect::<StringArray>());
1732 let dict_array: ArrayRef = Arc::new(
1733 strings2
1734 .iter()
1735 .cloned()
1736 .collect::<DictionaryArray<Int32Type>>(),
1737 );
1738
1739 let random_state = RandomState::with_seed(0);
1740
1741 let mut one_col_hashes = vec![0; strings1.len()];
1742 create_hashes(
1743 &[Arc::clone(&dict_array) as ArrayRef],
1744 &random_state,
1745 &mut one_col_hashes,
1746 )
1747 .unwrap();
1748
1749 let mut two_col_hashes = vec![0; strings1.len()];
1750 create_hashes(
1751 &[dict_array, string_array],
1752 &random_state,
1753 &mut two_col_hashes,
1754 )
1755 .unwrap();
1756
1757 assert_eq!(one_col_hashes.len(), 3);
1758 assert_eq!(two_col_hashes.len(), 3);
1759
1760 assert_ne!(one_col_hashes, two_col_hashes);
1761 }
1762
1763 #[test]
1764 fn test_create_hashes_from_arrays() {
1765 let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1766 let float_array: ArrayRef =
1767 Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
1768
1769 let random_state = RandomState::with_seed(0);
1770 let hashes_buff = &mut vec![0; int_array.len()];
1771 let hashes =
1772 create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
1773 assert_eq!(hashes.len(), 4,);
1774 }
1775
1776 #[test]
1777 fn test_create_hashes_from_dyn_arrays() {
1778 let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1779 let float_array: ArrayRef =
1780 Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
1781
1782 fn test(arr1: &dyn Array, arr2: &dyn Array) {
1784 let random_state = RandomState::with_seed(0);
1785 let hashes_buff = &mut vec![0; arr1.len()];
1786 let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
1787 assert_eq!(hashes.len(), 4,);
1788 }
1789 test(&*int_array, &*float_array);
1790 }
1791
1792 #[test]
1793 fn test_create_hashes_equivalence() {
1794 let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1795 let random_state = RandomState::with_seed(0);
1796
1797 let mut hashes1 = vec![0; array.len()];
1798 create_hashes(
1799 &[Arc::clone(&array) as ArrayRef],
1800 &random_state,
1801 &mut hashes1,
1802 )
1803 .unwrap();
1804
1805 let mut hashes2 = vec![0; array.len()];
1806 create_hashes([array], &random_state, &mut hashes2).unwrap();
1807
1808 assert_eq!(hashes1, hashes2);
1809 }
1810
1811 #[test]
1812 fn test_with_hashes() {
1813 let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1814 let random_state = RandomState::with_seed(0);
1815
1816 let mut expected_hashes = vec![0; array.len()];
1818 create_hashes([&array], &random_state, &mut expected_hashes).unwrap();
1819
1820 let result = with_hashes([&array], &random_state, |hashes| {
1821 assert_eq!(hashes.len(), 4);
1822 assert_eq!(hashes, &expected_hashes[..]);
1824 Ok(hashes.to_vec())
1826 })
1827 .unwrap();
1828
1829 assert_eq!(result, expected_hashes);
1831 }
1832
1833 #[test]
1834 fn test_with_hashes_multi_column() {
1835 let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
1836 let str_array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
1837 let random_state = RandomState::with_seed(0);
1838
1839 let mut expected_hashes = vec![0; int_array.len()];
1841 create_hashes(
1842 [&int_array, &str_array],
1843 &random_state,
1844 &mut expected_hashes,
1845 )
1846 .unwrap();
1847
1848 with_hashes([&int_array, &str_array], &random_state, |hashes| {
1849 assert_eq!(hashes.len(), 3);
1850 assert_eq!(hashes, &expected_hashes[..]);
1851 Ok(())
1852 })
1853 .unwrap();
1854 }
1855
1856 #[test]
1857 fn test_with_hashes_empty_arrays() {
1858 let random_state = RandomState::with_seed(0);
1859
1860 let empty: [&ArrayRef; 0] = [];
1862 let result = with_hashes(empty, &random_state, |_hashes| Ok(()));
1863
1864 assert!(result.is_err());
1865 assert!(
1866 result
1867 .unwrap_err()
1868 .to_string()
1869 .contains("requires at least one array")
1870 );
1871 }
1872
1873 #[test]
1874 fn test_with_hashes_reentrancy() {
1875 let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
1876 let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6]));
1877 let random_state = RandomState::with_seed(0);
1878
1879 let result = with_hashes([&array], &random_state, |_hashes| {
1881 with_hashes([&array2], &random_state, |_inner_hashes| Ok(()))
1883 });
1884
1885 assert!(result.is_err());
1886 let err_msg = result.unwrap_err().to_string();
1887 assert!(
1888 err_msg.contains("reentrantly") || err_msg.contains("cannot be called"),
1889 "Error message should mention reentrancy: {err_msg}",
1890 );
1891 }
1892
1893 #[test]
1894 #[cfg(not(feature = "force_hash_collisions"))]
1895 fn create_hashes_for_sparse_union_arrays() {
1896 let int_array = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]);
1898 let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
1899
1900 let type_ids = vec![0_i8, 1, 0, 0].into();
1901 let children = vec![
1902 Arc::new(int_array) as ArrayRef,
1903 Arc::new(str_array) as ArrayRef,
1904 ];
1905
1906 let union_fields = [
1907 (0, Arc::new(Field::new("a", DataType::Int32, true))),
1908 (1, Arc::new(Field::new("b", DataType::Utf8, true))),
1909 ]
1910 .into_iter()
1911 .collect();
1912
1913 let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
1914 let array_ref = Arc::new(array) as ArrayRef;
1915
1916 let random_state = RandomState::with_seed(0);
1917 let mut hashes = vec![0; array_ref.len()];
1918 create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1919
1920 assert_eq!(hashes[0], hashes[3]);
1922 assert_ne!(hashes[0], hashes[2]);
1924 assert_ne!(hashes[0], hashes[1]);
1926 }
1927
1928 #[test]
1929 #[cfg(not(feature = "force_hash_collisions"))]
1930 fn create_hashes_for_sparse_union_arrays_with_nulls() {
1931 let int_array = Int32Array::from(vec![Some(5), None, None, None]);
1933 let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
1934
1935 let type_ids = vec![0, 1, 0, 1].into();
1936 let children = vec![
1937 Arc::new(int_array) as ArrayRef,
1938 Arc::new(str_array) as ArrayRef,
1939 ];
1940
1941 let union_fields = [
1942 (0, Arc::new(Field::new("a", DataType::Int32, true))),
1943 (1, Arc::new(Field::new("b", DataType::Utf8, true))),
1944 ]
1945 .into_iter()
1946 .collect();
1947
1948 let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
1949 let array_ref = Arc::new(array) as ArrayRef;
1950
1951 let random_state = RandomState::with_seed(0);
1952 let mut hashes = vec![0; array_ref.len()];
1953 create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1954
1955 assert_eq!(hashes[2], hashes[3]);
1958
1959 assert_ne!(hashes[0], hashes[2]);
1961
1962 assert_ne!(hashes[1], hashes[3]);
1964 }
1965
1966 #[test]
1967 #[cfg(not(feature = "force_hash_collisions"))]
1968 fn create_hashes_for_dense_union_arrays() {
1969 let int_array = Int32Array::from(vec![67, 100, 67]);
1972 let str_array = StringArray::from(vec!["norm", "macdonald"]);
1973
1974 let type_ids = vec![0, 1, 0, 1, 0].into();
1975 let offsets = vec![0, 0, 1, 1, 2].into();
1976 let children = vec![
1977 Arc::new(int_array) as ArrayRef,
1978 Arc::new(str_array) as ArrayRef,
1979 ];
1980
1981 let union_fields = [
1982 (0, Arc::new(Field::new("a", DataType::Int32, false))),
1983 (1, Arc::new(Field::new("b", DataType::Utf8, false))),
1984 ]
1985 .into_iter()
1986 .collect();
1987
1988 let array =
1989 UnionArray::try_new(union_fields, type_ids, Some(offsets), children).unwrap();
1990 let array_ref = Arc::new(array) as ArrayRef;
1991
1992 let random_state = RandomState::with_seed(0);
1993 let mut hashes = vec![0; array_ref.len()];
1994 create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1995
1996 assert_ne!(hashes[0], hashes[1]);
1998 assert_ne!(hashes[0], hashes[2]);
2000 assert_ne!(hashes[1], hashes[3]);
2002 assert_ne!(hashes[2], hashes[3]);
2004 assert_eq!(hashes[0], hashes[4]);
2006 }
2007
2008 #[test]
2009 #[cfg(not(feature = "force_hash_collisions"))]
2010 fn create_hashes_for_sliced_run_array() -> Result<()> {
2011 let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
2012 let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
2013 let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
2014
2015 let random_state = RandomState::with_seed(0);
2016 let mut full_hashes = vec![0; array.len()];
2017 create_hashes(
2018 &[Arc::clone(&array) as ArrayRef],
2019 &random_state,
2020 &mut full_hashes,
2021 )?;
2022
2023 let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef;
2024 let sliced_array = array_ref.slice(2, 3);
2025
2026 let mut sliced_hashes = vec![0; sliced_array.len()];
2027 create_hashes(
2028 std::slice::from_ref(&sliced_array),
2029 &random_state,
2030 &mut sliced_hashes,
2031 )?;
2032
2033 assert_eq!(sliced_hashes.len(), 3);
2034 assert_eq!(sliced_hashes[0], sliced_hashes[1]);
2035 assert_eq!(sliced_hashes[1], sliced_hashes[2]);
2036 assert_eq!(&sliced_hashes, &full_hashes[2..5]);
2037
2038 Ok(())
2039 }
2040
2041 #[test]
2042 #[cfg(not(feature = "force_hash_collisions"))]
2043 fn test_run_array_with_nulls() -> Result<()> {
2044 let values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2045 let run_ends = Arc::new(Int32Array::from(vec![2, 4, 6]));
2046 let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
2047
2048 let random_state = RandomState::with_seed(0);
2049 let mut hashes = vec![0; array.len()];
2050 create_hashes(
2051 &[Arc::clone(&array) as ArrayRef],
2052 &random_state,
2053 &mut hashes,
2054 )?;
2055
2056 assert_eq!(hashes[0], hashes[1]);
2057 assert_ne!(hashes[0], 0);
2058 assert_eq!(hashes[2], hashes[3]);
2059 assert_eq!(hashes[2], 0);
2060 assert_eq!(hashes[4], hashes[5]);
2061 assert_ne!(hashes[4], 0);
2062 assert_ne!(hashes[0], hashes[4]);
2063
2064 Ok(())
2065 }
2066
2067 #[test]
2068 #[cfg(not(feature = "force_hash_collisions"))]
2069 fn test_run_array_with_nulls_multicolumn() -> Result<()> {
2070 let primitive_array = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2071 let run_values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2072 let run_ends = Arc::new(Int32Array::from(vec![1, 2, 3]));
2073 let run_array =
2074 Arc::new(RunArray::try_new(&run_ends, run_values.as_ref()).unwrap());
2075 let second_col = Arc::new(Int32Array::from(vec![100, 200, 300]));
2076
2077 let random_state = RandomState::with_seed(0);
2078
2079 let mut primitive_hashes = vec![0; 3];
2080 create_hashes(
2081 &[
2082 Arc::clone(&primitive_array) as ArrayRef,
2083 Arc::clone(&second_col) as ArrayRef,
2084 ],
2085 &random_state,
2086 &mut primitive_hashes,
2087 )?;
2088
2089 let mut run_hashes = vec![0; 3];
2090 create_hashes(
2091 &[
2092 Arc::clone(&run_array) as ArrayRef,
2093 Arc::clone(&second_col) as ArrayRef,
2094 ],
2095 &random_state,
2096 &mut run_hashes,
2097 )?;
2098
2099 assert_eq!(primitive_hashes, run_hashes);
2100
2101 Ok(())
2102 }
2103}