1use ahash::RandomState;
21use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
22use arrow::array::*;
23use arrow::datatypes::*;
24#[cfg(not(feature = "force_hash_collisions"))]
25use arrow::{downcast_dictionary_array, downcast_primitive_array};
26
27#[cfg(not(feature = "force_hash_collisions"))]
28use crate::cast::{
29 as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
30 as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31 as_string_array, as_string_view_array, as_struct_array,
32};
33use crate::error::Result;
34#[cfg(not(feature = "force_hash_collisions"))]
35use crate::error::_internal_err;
36
37#[inline]
39pub fn combine_hashes(l: u64, r: u64) -> u64 {
40 let hash = (17 * 37u64).wrapping_add(l);
41 hash.wrapping_mul(37).wrapping_add(r)
42}
43
44#[cfg(not(feature = "force_hash_collisions"))]
45fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
46 if mul_col {
47 hashes_buffer.iter_mut().for_each(|hash| {
48 *hash = combine_hashes(random_state.hash_one(1), *hash);
50 })
51 } else {
52 hashes_buffer.iter_mut().for_each(|hash| {
53 *hash = random_state.hash_one(1);
54 })
55 }
56}
57
58pub trait HashValue {
59 fn hash_one(&self, state: &RandomState) -> u64;
60}
61
62impl<T: HashValue + ?Sized> HashValue for &T {
63 fn hash_one(&self, state: &RandomState) -> u64 {
64 T::hash_one(self, state)
65 }
66}
67
68macro_rules! hash_value {
69 ($($t:ty),+) => {
70 $(impl HashValue for $t {
71 fn hash_one(&self, state: &RandomState) -> u64 {
72 state.hash_one(self)
73 }
74 })+
75 };
76}
77hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
78hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
79
80macro_rules! hash_float_value {
81 ($(($t:ty, $i:ty)),+) => {
82 $(impl HashValue for $t {
83 fn hash_one(&self, state: &RandomState) -> u64 {
84 state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
85 }
86 })+
87 };
88}
89hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
90
91#[cfg(not(feature = "force_hash_collisions"))]
95fn hash_array_primitive<T>(
96 array: &PrimitiveArray<T>,
97 random_state: &RandomState,
98 hashes_buffer: &mut [u64],
99 rehash: bool,
100) where
101 T: ArrowPrimitiveType<Native: HashValue>,
102{
103 assert_eq!(
104 hashes_buffer.len(),
105 array.len(),
106 "hashes_buffer and array should be of equal length"
107 );
108
109 if array.null_count() == 0 {
110 if rehash {
111 for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
112 *hash = combine_hashes(value.hash_one(random_state), *hash);
113 }
114 } else {
115 for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
116 *hash = value.hash_one(random_state);
117 }
118 }
119 } else if rehash {
120 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
121 if !array.is_null(i) {
122 let value = unsafe { array.value_unchecked(i) };
123 *hash = combine_hashes(value.hash_one(random_state), *hash);
124 }
125 }
126 } else {
127 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
128 if !array.is_null(i) {
129 let value = unsafe { array.value_unchecked(i) };
130 *hash = value.hash_one(random_state);
131 }
132 }
133 }
134}
135
136#[cfg(not(feature = "force_hash_collisions"))]
140fn hash_array<T>(
141 array: &T,
142 random_state: &RandomState,
143 hashes_buffer: &mut [u64],
144 rehash: bool,
145) where
146 T: ArrayAccessor,
147 T::Item: HashValue,
148{
149 assert_eq!(
150 hashes_buffer.len(),
151 array.len(),
152 "hashes_buffer and array should be of equal length"
153 );
154
155 if array.null_count() == 0 {
156 if rehash {
157 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
158 let value = unsafe { array.value_unchecked(i) };
159 *hash = combine_hashes(value.hash_one(random_state), *hash);
160 }
161 } else {
162 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
163 let value = unsafe { array.value_unchecked(i) };
164 *hash = value.hash_one(random_state);
165 }
166 }
167 } else if rehash {
168 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
169 if !array.is_null(i) {
170 let value = unsafe { array.value_unchecked(i) };
171 *hash = combine_hashes(value.hash_one(random_state), *hash);
172 }
173 }
174 } else {
175 for (i, hash) in hashes_buffer.iter_mut().enumerate() {
176 if !array.is_null(i) {
177 let value = unsafe { array.value_unchecked(i) };
178 *hash = value.hash_one(random_state);
179 }
180 }
181 }
182}
183
184#[cfg(not(feature = "force_hash_collisions"))]
186#[inline]
187fn update_hash_for_dict_key(
188 hash: &mut u64,
189 dict_hashes: &[u64],
190 dict_values: &dyn Array,
191 idx: usize,
192 multi_col: bool,
193) {
194 if dict_values.is_valid(idx) {
195 if multi_col {
196 *hash = combine_hashes(dict_hashes[idx], *hash);
197 } else {
198 *hash = dict_hashes[idx];
199 }
200 }
201 }
203
204#[cfg(not(feature = "force_hash_collisions"))]
206fn hash_dictionary<K: ArrowDictionaryKeyType>(
207 array: &DictionaryArray<K>,
208 random_state: &RandomState,
209 hashes_buffer: &mut [u64],
210 multi_col: bool,
211) -> Result<()> {
212 let dict_values = array.values();
216 let mut dict_hashes = vec![0; dict_values.len()];
217 create_hashes([dict_values], random_state, &mut dict_hashes)?;
218
219 for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
221 if let Some(key) = key {
222 let idx = key.as_usize();
223 update_hash_for_dict_key(
224 hash,
225 &dict_hashes,
226 dict_values.as_ref(),
227 idx,
228 multi_col,
229 );
230 } }
232 Ok(())
233}
234
235#[cfg(not(feature = "force_hash_collisions"))]
236fn hash_struct_array(
237 array: &StructArray,
238 random_state: &RandomState,
239 hashes_buffer: &mut [u64],
240) -> Result<()> {
241 let nulls = array.nulls();
242 let row_len = array.len();
243
244 let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
245 nulls.valid_indices().collect()
246 } else {
247 (0..row_len).collect()
248 };
249
250 let mut values_hashes = vec![0u64; row_len];
252 create_hashes(array.columns(), random_state, &mut values_hashes)?;
253
254 for i in valid_row_indices {
255 let hash = &mut hashes_buffer[i];
256 *hash = combine_hashes(*hash, values_hashes[i]);
257 }
258
259 Ok(())
260}
261
262#[cfg(not(feature = "force_hash_collisions"))]
264fn hash_map_array(
265 array: &MapArray,
266 random_state: &RandomState,
267 hashes_buffer: &mut [u64],
268) -> Result<()> {
269 let nulls = array.nulls();
270 let offsets = array.offsets();
271
272 let mut values_hashes = vec![0u64; array.entries().len()];
274 create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
275
276 if let Some(nulls) = nulls {
278 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
279 if nulls.is_valid(i) {
280 let hash = &mut hashes_buffer[i];
281 for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
282 *hash = combine_hashes(*hash, *values_hash);
283 }
284 }
285 }
286 } else {
287 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
288 let hash = &mut hashes_buffer[i];
289 for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
290 *hash = combine_hashes(*hash, *values_hash);
291 }
292 }
293 }
294
295 Ok(())
296}
297
298#[cfg(not(feature = "force_hash_collisions"))]
299fn hash_list_array<OffsetSize>(
300 array: &GenericListArray<OffsetSize>,
301 random_state: &RandomState,
302 hashes_buffer: &mut [u64],
303) -> Result<()>
304where
305 OffsetSize: OffsetSizeTrait,
306{
307 let values = array.values();
308 let offsets = array.value_offsets();
309 let nulls = array.nulls();
310 let mut values_hashes = vec![0u64; values.len()];
311 create_hashes([values], random_state, &mut values_hashes)?;
312 if let Some(nulls) = nulls {
313 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
314 if nulls.is_valid(i) {
315 let hash = &mut hashes_buffer[i];
316 for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
317 *hash = combine_hashes(*hash, *values_hash);
318 }
319 }
320 }
321 } else {
322 for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
323 let hash = &mut hashes_buffer[i];
324 for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
325 *hash = combine_hashes(*hash, *values_hash);
326 }
327 }
328 }
329 Ok(())
330}
331
332#[cfg(not(feature = "force_hash_collisions"))]
333fn hash_fixed_list_array(
334 array: &FixedSizeListArray,
335 random_state: &RandomState,
336 hashes_buffer: &mut [u64],
337) -> Result<()> {
338 let values = array.values();
339 let value_length = array.value_length() as usize;
340 let nulls = array.nulls();
341 let mut values_hashes = vec![0u64; values.len()];
342 create_hashes([values], random_state, &mut values_hashes)?;
343 if let Some(nulls) = nulls {
344 for i in 0..array.len() {
345 if nulls.is_valid(i) {
346 let hash = &mut hashes_buffer[i];
347 for values_hash in
348 &values_hashes[i * value_length..(i + 1) * value_length]
349 {
350 *hash = combine_hashes(*hash, *values_hash);
351 }
352 }
353 }
354 } else {
355 for i in 0..array.len() {
356 let hash = &mut hashes_buffer[i];
357 for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
358 *hash = combine_hashes(*hash, *values_hash);
359 }
360 }
361 }
362 Ok(())
363}
364
365#[cfg(not(feature = "force_hash_collisions"))]
368fn hash_single_array(
369 array: &dyn Array,
370 random_state: &RandomState,
371 hashes_buffer: &mut [u64],
372 rehash: bool,
373) -> Result<()> {
374 downcast_primitive_array! {
375 array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
376 DataType::Null => hash_null(random_state, hashes_buffer, rehash),
377 DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
378 DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
379 DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash),
380 DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
381 DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
382 DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
383 DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
384 DataType::FixedSizeBinary(_) => {
385 let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
386 hash_array(&array, random_state, hashes_buffer, rehash)
387 }
388 DataType::Dictionary(_, _) => downcast_dictionary_array! {
389 array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
390 _ => unreachable!()
391 }
392 DataType::Struct(_) => {
393 let array = as_struct_array(array)?;
394 hash_struct_array(array, random_state, hashes_buffer)?;
395 }
396 DataType::List(_) => {
397 let array = as_list_array(array)?;
398 hash_list_array(array, random_state, hashes_buffer)?;
399 }
400 DataType::LargeList(_) => {
401 let array = as_large_list_array(array)?;
402 hash_list_array(array, random_state, hashes_buffer)?;
403 }
404 DataType::Map(_, _) => {
405 let array = as_map_array(array)?;
406 hash_map_array(array, random_state, hashes_buffer)?;
407 }
408 DataType::FixedSizeList(_,_) => {
409 let array = as_fixed_size_list_array(array)?;
410 hash_fixed_list_array(array, random_state, hashes_buffer)?;
411 }
412 _ => {
413 return _internal_err!(
415 "Unsupported data type in hasher: {}",
416 array.data_type()
417 );
418 }
419 }
420 Ok(())
421}
422
423#[cfg(feature = "force_hash_collisions")]
425fn hash_single_array(
426 _array: &dyn Array,
427 _random_state: &RandomState,
428 hashes_buffer: &mut [u64],
429 _rehash: bool,
430) -> Result<()> {
431 for hash in hashes_buffer.iter_mut() {
432 *hash = 0
433 }
434 Ok(())
435}
436
437pub trait AsDynArray {
447 fn as_dyn_array(&self) -> &dyn Array;
448}
449
450impl AsDynArray for dyn Array {
451 fn as_dyn_array(&self) -> &dyn Array {
452 self
453 }
454}
455
456impl AsDynArray for &dyn Array {
457 fn as_dyn_array(&self) -> &dyn Array {
458 *self
459 }
460}
461
462impl AsDynArray for ArrayRef {
463 fn as_dyn_array(&self) -> &dyn Array {
464 self.as_ref()
465 }
466}
467
468impl AsDynArray for &ArrayRef {
469 fn as_dyn_array(&self) -> &dyn Array {
470 self.as_ref()
471 }
472}
473
474pub fn create_hashes<'a, I, T>(
479 arrays: I,
480 random_state: &RandomState,
481 hashes_buffer: &'a mut Vec<u64>,
482) -> Result<&'a mut Vec<u64>>
483where
484 I: IntoIterator<Item = T>,
485 T: AsDynArray,
486{
487 for (i, array) in arrays.into_iter().enumerate() {
488 let rehash = i >= 1;
490 hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
491 }
492 Ok(hashes_buffer)
493}
494
495#[cfg(test)]
496mod tests {
497 use std::sync::Arc;
498
499 use arrow::array::*;
500 #[cfg(not(feature = "force_hash_collisions"))]
501 use arrow::datatypes::*;
502
503 use super::*;
504
505 #[test]
506 fn create_hashes_for_decimal_array() -> Result<()> {
507 let array = vec![1, 2, 3, 4]
508 .into_iter()
509 .map(Some)
510 .collect::<Decimal128Array>()
511 .with_precision_and_scale(20, 3)
512 .unwrap();
513 let array_ref: ArrayRef = Arc::new(array);
514 let random_state = RandomState::with_seeds(0, 0, 0, 0);
515 let hashes_buff = &mut vec![0; array_ref.len()];
516 let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
517 assert_eq!(hashes.len(), 4);
518 Ok(())
519 }
520
521 #[test]
522 fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
523 let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
524 let random_state = RandomState::with_seeds(0, 0, 0, 0);
525 let hashes_buff = &mut vec![0; 0];
526 let hashes = create_hashes(
527 &[Arc::new(empty_array) as ArrayRef],
528 &random_state,
529 hashes_buff,
530 )?;
531 assert_eq!(hashes, &Vec::<u64>::new());
532 Ok(())
533 }
534
535 #[test]
536 fn create_hashes_for_float_arrays() -> Result<()> {
537 let f32_arr: ArrayRef =
538 Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
539 let f64_arr: ArrayRef =
540 Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
541
542 let random_state = RandomState::with_seeds(0, 0, 0, 0);
543 let hashes_buff = &mut vec![0; f32_arr.len()];
544 let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
545 assert_eq!(hashes.len(), 4,);
546
547 let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
548 assert_eq!(hashes.len(), 4,);
549
550 Ok(())
551 }
552
553 macro_rules! create_hash_binary {
554 ($NAME:ident, $ARRAY:ty) => {
555 #[cfg(not(feature = "force_hash_collisions"))]
556 #[test]
557 fn $NAME() {
558 let binary = [
559 Some(b"short".to_byte_slice()),
560 None,
561 Some(b"long but different 12 bytes string"),
562 Some(b"short2"),
563 Some(b"Longer than 12 bytes string"),
564 Some(b"short"),
565 Some(b"Longer than 12 bytes string"),
566 ];
567
568 let binary_array: ArrayRef =
569 Arc::new(binary.iter().cloned().collect::<$ARRAY>());
570 let ref_array: ArrayRef =
571 Arc::new(binary.iter().cloned().collect::<BinaryArray>());
572
573 let random_state = RandomState::with_seeds(0, 0, 0, 0);
574
575 let mut binary_hashes = vec![0; binary.len()];
576 create_hashes(&[binary_array], &random_state, &mut binary_hashes)
577 .unwrap();
578
579 let mut ref_hashes = vec![0; binary.len()];
580 create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
581
582 for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
584 match val {
585 Some(_) => assert_ne!(*hash, 0),
586 None => assert_eq!(*hash, 0),
587 }
588 }
589
590 assert_eq!(binary_hashes, ref_hashes);
592
593 assert_eq!(binary[0], binary[5]);
595 assert_eq!(binary[4], binary[6]);
596
597 assert_ne!(binary[0], binary[2]);
599 }
600 };
601 }
602
603 create_hash_binary!(binary_array, BinaryArray);
604 create_hash_binary!(binary_view_array, BinaryViewArray);
605
606 #[test]
607 fn create_hashes_fixed_size_binary() -> Result<()> {
608 let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
609 let fixed_size_binary_array: ArrayRef =
610 Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
611
612 let random_state = RandomState::with_seeds(0, 0, 0, 0);
613 let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
614 let hashes =
615 create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
616 assert_eq!(hashes.len(), 3,);
617
618 Ok(())
619 }
620
621 macro_rules! create_hash_string {
622 ($NAME:ident, $ARRAY:ty) => {
623 #[cfg(not(feature = "force_hash_collisions"))]
624 #[test]
625 fn $NAME() {
626 let strings = [
627 Some("short"),
628 None,
629 Some("long but different 12 bytes string"),
630 Some("short2"),
631 Some("Longer than 12 bytes string"),
632 Some("short"),
633 Some("Longer than 12 bytes string"),
634 ];
635
636 let string_array: ArrayRef =
637 Arc::new(strings.iter().cloned().collect::<$ARRAY>());
638 let dict_array: ArrayRef = Arc::new(
639 strings
640 .iter()
641 .cloned()
642 .collect::<DictionaryArray<Int8Type>>(),
643 );
644
645 let random_state = RandomState::with_seeds(0, 0, 0, 0);
646
647 let mut string_hashes = vec![0; strings.len()];
648 create_hashes(&[string_array], &random_state, &mut string_hashes)
649 .unwrap();
650
651 let mut dict_hashes = vec![0; strings.len()];
652 create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
653
654 for (val, hash) in strings.iter().zip(string_hashes.iter()) {
656 match val {
657 Some(_) => assert_ne!(*hash, 0),
658 None => assert_eq!(*hash, 0),
659 }
660 }
661
662 assert_eq!(string_hashes, dict_hashes);
664
665 assert_eq!(strings[0], strings[5]);
667 assert_eq!(strings[4], strings[6]);
668
669 assert_ne!(strings[0], strings[2]);
671 }
672 };
673 }
674
675 create_hash_string!(string_array, StringArray);
676 create_hash_string!(large_string_array, LargeStringArray);
677 create_hash_string!(string_view_array, StringArray);
678 create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
679
680 #[test]
681 #[cfg(not(feature = "force_hash_collisions"))]
683 fn create_hashes_for_dict_arrays() {
684 let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
685
686 let string_array: ArrayRef =
687 Arc::new(strings.iter().cloned().collect::<StringArray>());
688 let dict_array: ArrayRef = Arc::new(
689 strings
690 .iter()
691 .cloned()
692 .collect::<DictionaryArray<Int8Type>>(),
693 );
694
695 let random_state = RandomState::with_seeds(0, 0, 0, 0);
696
697 let mut string_hashes = vec![0; strings.len()];
698 create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
699
700 let mut dict_hashes = vec![0; strings.len()];
701 create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
702
703 for (val, hash) in strings.iter().zip(string_hashes.iter()) {
705 match val {
706 Some(_) => assert_ne!(*hash, 0),
707 None => assert_eq!(*hash, 0),
708 }
709 }
710
711 assert_eq!(string_hashes, dict_hashes);
713
714 assert_eq!(strings[1], strings[4]);
716 assert_eq!(dict_hashes[1], dict_hashes[4]);
717 assert_eq!(strings[0], strings[3]);
718 assert_eq!(dict_hashes[0], dict_hashes[3]);
719
720 assert_ne!(strings[0], strings[2]);
722 assert_ne!(dict_hashes[0], dict_hashes[2]);
723 }
724
725 #[test]
726 #[cfg(not(feature = "force_hash_collisions"))]
728 fn create_hashes_for_list_arrays() {
729 let data = vec![
730 Some(vec![Some(0), Some(1), Some(2)]),
731 None,
732 Some(vec![Some(3), None, Some(5)]),
733 Some(vec![Some(3), None, Some(5)]),
734 None,
735 Some(vec![Some(0), Some(1), Some(2)]),
736 Some(vec![]),
737 ];
738 let list_array =
739 Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
740 let random_state = RandomState::with_seeds(0, 0, 0, 0);
741 let mut hashes = vec![0; list_array.len()];
742 create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
743 assert_eq!(hashes[0], hashes[5]);
744 assert_eq!(hashes[1], hashes[4]);
745 assert_eq!(hashes[2], hashes[3]);
746 assert_eq!(hashes[1], hashes[6]); }
748
749 #[test]
750 #[cfg(not(feature = "force_hash_collisions"))]
752 fn create_hashes_for_fixed_size_list_arrays() {
753 let data = vec![
754 Some(vec![Some(0), Some(1), Some(2)]),
755 None,
756 Some(vec![Some(3), None, Some(5)]),
757 Some(vec![Some(3), None, Some(5)]),
758 None,
759 Some(vec![Some(0), Some(1), Some(2)]),
760 ];
761 let list_array =
762 Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
763 data, 3,
764 )) as ArrayRef;
765 let random_state = RandomState::with_seeds(0, 0, 0, 0);
766 let mut hashes = vec![0; list_array.len()];
767 create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
768 assert_eq!(hashes[0], hashes[5]);
769 assert_eq!(hashes[1], hashes[4]);
770 assert_eq!(hashes[2], hashes[3]);
771 }
772
773 #[test]
774 #[cfg(not(feature = "force_hash_collisions"))]
776 fn create_hashes_for_struct_arrays() {
777 use arrow::buffer::Buffer;
778
779 let boolarr = Arc::new(BooleanArray::from(vec![
780 false, false, true, true, true, true,
781 ]));
782 let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
783
784 let struct_array = StructArray::from((
785 vec![
786 (
787 Arc::new(Field::new("bool", DataType::Boolean, false)),
788 Arc::clone(&boolarr) as ArrayRef,
789 ),
790 (
791 Arc::new(Field::new("i32", DataType::Int32, false)),
792 Arc::clone(&i32arr) as ArrayRef,
793 ),
794 (
795 Arc::new(Field::new("i32", DataType::Int32, false)),
796 Arc::clone(&i32arr) as ArrayRef,
797 ),
798 (
799 Arc::new(Field::new("bool", DataType::Boolean, false)),
800 Arc::clone(&boolarr) as ArrayRef,
801 ),
802 ],
803 Buffer::from(&[0b001011]),
804 ));
805
806 assert!(struct_array.is_valid(0));
807 assert!(struct_array.is_valid(1));
808 assert!(struct_array.is_null(2));
809 assert!(struct_array.is_valid(3));
810 assert!(struct_array.is_null(4));
811 assert!(struct_array.is_null(5));
812
813 let array = Arc::new(struct_array) as ArrayRef;
814
815 let random_state = RandomState::with_seeds(0, 0, 0, 0);
816 let mut hashes = vec![0; array.len()];
817 create_hashes(&[array], &random_state, &mut hashes).unwrap();
818 assert_eq!(hashes[0], hashes[1]);
819 assert_ne!(hashes[2], hashes[3]);
821 assert_eq!(hashes[4], hashes[5]);
823 }
824
825 #[test]
826 #[cfg(not(feature = "force_hash_collisions"))]
828 fn create_hashes_for_struct_arrays_more_column_than_row() {
829 let struct_array = StructArray::from(vec![
830 (
831 Arc::new(Field::new("bool", DataType::Boolean, false)),
832 Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
833 ),
834 (
835 Arc::new(Field::new("i32-1", DataType::Int32, false)),
836 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
837 ),
838 (
839 Arc::new(Field::new("i32-2", DataType::Int32, false)),
840 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
841 ),
842 (
843 Arc::new(Field::new("i32-3", DataType::Int32, false)),
844 Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
845 ),
846 ]);
847
848 assert!(struct_array.is_valid(0));
849 assert!(struct_array.is_valid(1));
850
851 let array = Arc::new(struct_array) as ArrayRef;
852 let random_state = RandomState::with_seeds(0, 0, 0, 0);
853 let mut hashes = vec![0; array.len()];
854 create_hashes(&[array], &random_state, &mut hashes).unwrap();
855 assert_eq!(hashes[0], hashes[1]);
856 }
857
858 #[test]
859 #[cfg(not(feature = "force_hash_collisions"))]
861 fn create_hashes_for_map_arrays() {
862 let mut builder =
863 MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
864 builder.keys().append_value("key1");
866 builder.keys().append_value("key2");
867 builder.values().append_value(1);
868 builder.values().append_value(2);
869 builder.append(true).unwrap();
870 builder.keys().append_value("key1");
872 builder.keys().append_value("key2");
873 builder.values().append_value(1);
874 builder.values().append_value(2);
875 builder.append(true).unwrap();
876 builder.keys().append_value("key1");
878 builder.keys().append_value("key2");
879 builder.values().append_value(1);
880 builder.values().append_value(3);
881 builder.append(true).unwrap();
882 builder.keys().append_value("key1");
884 builder.keys().append_value("key3");
885 builder.values().append_value(1);
886 builder.values().append_value(2);
887 builder.append(true).unwrap();
888 builder.keys().append_value("key1");
890 builder.values().append_value(1);
891 builder.append(true).unwrap();
892 builder.keys().append_value("key1");
894 builder.values().append_null();
895 builder.append(true).unwrap();
896 builder.append(true).unwrap();
898 builder.keys().append_value("key1");
900 builder.values().append_value(1);
901 builder.append(false).unwrap();
902
903 let array = Arc::new(builder.finish()) as ArrayRef;
904
905 let random_state = RandomState::with_seeds(0, 0, 0, 0);
906 let mut hashes = vec![0; array.len()];
907 create_hashes(&[array], &random_state, &mut hashes).unwrap();
908 assert_eq!(hashes[0], hashes[1]); assert_ne!(hashes[0], hashes[2]); assert_ne!(hashes[0], hashes[3]); assert_ne!(hashes[0], hashes[4]); assert_ne!(hashes[4], hashes[5]); assert_eq!(hashes[6], hashes[7]); }
915
916 #[test]
917 #[cfg(not(feature = "force_hash_collisions"))]
919 fn create_multi_column_hash_for_dict_arrays() {
920 let strings1 = [Some("foo"), None, Some("bar")];
921 let strings2 = [Some("blarg"), Some("blah"), None];
922
923 let string_array: ArrayRef =
924 Arc::new(strings1.iter().cloned().collect::<StringArray>());
925 let dict_array: ArrayRef = Arc::new(
926 strings2
927 .iter()
928 .cloned()
929 .collect::<DictionaryArray<Int32Type>>(),
930 );
931
932 let random_state = RandomState::with_seeds(0, 0, 0, 0);
933
934 let mut one_col_hashes = vec![0; strings1.len()];
935 create_hashes(
936 &[Arc::clone(&dict_array) as ArrayRef],
937 &random_state,
938 &mut one_col_hashes,
939 )
940 .unwrap();
941
942 let mut two_col_hashes = vec![0; strings1.len()];
943 create_hashes(
944 &[dict_array, string_array],
945 &random_state,
946 &mut two_col_hashes,
947 )
948 .unwrap();
949
950 assert_eq!(one_col_hashes.len(), 3);
951 assert_eq!(two_col_hashes.len(), 3);
952
953 assert_ne!(one_col_hashes, two_col_hashes);
954 }
955
956 #[test]
957 fn test_create_hashes_from_arrays() {
958 let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
959 let float_array: ArrayRef =
960 Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
961
962 let random_state = RandomState::with_seeds(0, 0, 0, 0);
963 let hashes_buff = &mut vec![0; int_array.len()];
964 let hashes =
965 create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
966 assert_eq!(hashes.len(), 4,);
967 }
968
969 #[test]
970 fn test_create_hashes_from_dyn_arrays() {
971 let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
972 let float_array: ArrayRef =
973 Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
974
975 fn test(arr1: &dyn Array, arr2: &dyn Array) {
977 let random_state = RandomState::with_seeds(0, 0, 0, 0);
978 let hashes_buff = &mut vec![0; arr1.len()];
979 let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
980 assert_eq!(hashes.len(), 4,);
981 }
982 test(&*int_array, &*float_array);
983 }
984
985 #[test]
986 fn test_create_hashes_equivalence() {
987 let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
988 let random_state = RandomState::with_seeds(0, 0, 0, 0);
989
990 let mut hashes1 = vec![0; array.len()];
991 create_hashes(
992 &[Arc::clone(&array) as ArrayRef],
993 &random_state,
994 &mut hashes1,
995 )
996 .unwrap();
997
998 let mut hashes2 = vec![0; array.len()];
999 create_hashes([array], &random_state, &mut hashes2).unwrap();
1000
1001 assert_eq!(hashes1, hashes2);
1002 }
1003}