datafusion_common/
hash_utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Functionality used both on logical and physical plans
19
20use ahash::RandomState;
21use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
22use arrow::array::*;
23use arrow::datatypes::*;
24#[cfg(not(feature = "force_hash_collisions"))]
25use arrow::{downcast_dictionary_array, downcast_primitive_array};
26
27#[cfg(not(feature = "force_hash_collisions"))]
28use crate::cast::{
29    as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
30    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31    as_string_array, as_string_view_array, as_struct_array,
32};
33use crate::error::Result;
34#[cfg(not(feature = "force_hash_collisions"))]
35use crate::error::_internal_err;
36
37// Combines two hashes into one hash
38#[inline]
39pub fn combine_hashes(l: u64, r: u64) -> u64 {
40    let hash = (17 * 37u64).wrapping_add(l);
41    hash.wrapping_mul(37).wrapping_add(r)
42}
43
44#[cfg(not(feature = "force_hash_collisions"))]
45fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
46    if mul_col {
47        hashes_buffer.iter_mut().for_each(|hash| {
48            // stable hash for null value
49            *hash = combine_hashes(random_state.hash_one(1), *hash);
50        })
51    } else {
52        hashes_buffer.iter_mut().for_each(|hash| {
53            *hash = random_state.hash_one(1);
54        })
55    }
56}
57
58pub trait HashValue {
59    fn hash_one(&self, state: &RandomState) -> u64;
60}
61
62impl<T: HashValue + ?Sized> HashValue for &T {
63    fn hash_one(&self, state: &RandomState) -> u64 {
64        T::hash_one(self, state)
65    }
66}
67
68macro_rules! hash_value {
69    ($($t:ty),+) => {
70        $(impl HashValue for $t {
71            fn hash_one(&self, state: &RandomState) -> u64 {
72                state.hash_one(self)
73            }
74        })+
75    };
76}
77hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
78hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
79
80macro_rules! hash_float_value {
81    ($(($t:ty, $i:ty)),+) => {
82        $(impl HashValue for $t {
83            fn hash_one(&self, state: &RandomState) -> u64 {
84                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
85            }
86        })+
87    };
88}
89hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
90
91/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
92/// If `rehash==true` this combines the previous hash value in the buffer
93/// with the new hash using `combine_hashes`
94#[cfg(not(feature = "force_hash_collisions"))]
95fn hash_array_primitive<T>(
96    array: &PrimitiveArray<T>,
97    random_state: &RandomState,
98    hashes_buffer: &mut [u64],
99    rehash: bool,
100) where
101    T: ArrowPrimitiveType<Native: HashValue>,
102{
103    assert_eq!(
104        hashes_buffer.len(),
105        array.len(),
106        "hashes_buffer and array should be of equal length"
107    );
108
109    if array.null_count() == 0 {
110        if rehash {
111            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
112                *hash = combine_hashes(value.hash_one(random_state), *hash);
113            }
114        } else {
115            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
116                *hash = value.hash_one(random_state);
117            }
118        }
119    } else if rehash {
120        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
121            if !array.is_null(i) {
122                let value = unsafe { array.value_unchecked(i) };
123                *hash = combine_hashes(value.hash_one(random_state), *hash);
124            }
125        }
126    } else {
127        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
128            if !array.is_null(i) {
129                let value = unsafe { array.value_unchecked(i) };
130                *hash = value.hash_one(random_state);
131            }
132        }
133    }
134}
135
136/// Hashes one array into the `hashes_buffer`
137/// If `rehash==true` this combines the previous hash value in the buffer
138/// with the new hash using `combine_hashes`
139#[cfg(not(feature = "force_hash_collisions"))]
140fn hash_array<T>(
141    array: &T,
142    random_state: &RandomState,
143    hashes_buffer: &mut [u64],
144    rehash: bool,
145) where
146    T: ArrayAccessor,
147    T::Item: HashValue,
148{
149    assert_eq!(
150        hashes_buffer.len(),
151        array.len(),
152        "hashes_buffer and array should be of equal length"
153    );
154
155    if array.null_count() == 0 {
156        if rehash {
157            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
158                let value = unsafe { array.value_unchecked(i) };
159                *hash = combine_hashes(value.hash_one(random_state), *hash);
160            }
161        } else {
162            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
163                let value = unsafe { array.value_unchecked(i) };
164                *hash = value.hash_one(random_state);
165            }
166        }
167    } else if rehash {
168        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
169            if !array.is_null(i) {
170                let value = unsafe { array.value_unchecked(i) };
171                *hash = combine_hashes(value.hash_one(random_state), *hash);
172            }
173        }
174    } else {
175        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
176            if !array.is_null(i) {
177                let value = unsafe { array.value_unchecked(i) };
178                *hash = value.hash_one(random_state);
179            }
180        }
181    }
182}
183
184/// Helper function to update hash for a dictionary key if the value is valid
185#[cfg(not(feature = "force_hash_collisions"))]
186#[inline]
187fn update_hash_for_dict_key(
188    hash: &mut u64,
189    dict_hashes: &[u64],
190    dict_values: &dyn Array,
191    idx: usize,
192    multi_col: bool,
193) {
194    if dict_values.is_valid(idx) {
195        if multi_col {
196            *hash = combine_hashes(dict_hashes[idx], *hash);
197        } else {
198            *hash = dict_hashes[idx];
199        }
200    }
201    // no update for invalid dictionary value
202}
203
204/// Hash the values in a dictionary array
205#[cfg(not(feature = "force_hash_collisions"))]
206fn hash_dictionary<K: ArrowDictionaryKeyType>(
207    array: &DictionaryArray<K>,
208    random_state: &RandomState,
209    hashes_buffer: &mut [u64],
210    multi_col: bool,
211) -> Result<()> {
212    // Hash each dictionary value once, and then use that computed
213    // hash for each key value to avoid a potentially expensive
214    // redundant hashing for large dictionary elements (e.g. strings)
215    let dict_values = array.values();
216    let mut dict_hashes = vec![0; dict_values.len()];
217    create_hashes([dict_values], random_state, &mut dict_hashes)?;
218
219    // combine hash for each index in values
220    for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
221        if let Some(key) = key {
222            let idx = key.as_usize();
223            update_hash_for_dict_key(
224                hash,
225                &dict_hashes,
226                dict_values.as_ref(),
227                idx,
228                multi_col,
229            );
230        } // no update for Null key
231    }
232    Ok(())
233}
234
235#[cfg(not(feature = "force_hash_collisions"))]
236fn hash_struct_array(
237    array: &StructArray,
238    random_state: &RandomState,
239    hashes_buffer: &mut [u64],
240) -> Result<()> {
241    let nulls = array.nulls();
242    let row_len = array.len();
243
244    let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
245        nulls.valid_indices().collect()
246    } else {
247        (0..row_len).collect()
248    };
249
250    // Create hashes for each row that combines the hashes over all the column at that row.
251    let mut values_hashes = vec![0u64; row_len];
252    create_hashes(array.columns(), random_state, &mut values_hashes)?;
253
254    for i in valid_row_indices {
255        let hash = &mut hashes_buffer[i];
256        *hash = combine_hashes(*hash, values_hashes[i]);
257    }
258
259    Ok(())
260}
261
262// only adding this `cfg` b/c this function is only used with this `cfg`
263#[cfg(not(feature = "force_hash_collisions"))]
264fn hash_map_array(
265    array: &MapArray,
266    random_state: &RandomState,
267    hashes_buffer: &mut [u64],
268) -> Result<()> {
269    let nulls = array.nulls();
270    let offsets = array.offsets();
271
272    // Create hashes for each entry in each row
273    let mut values_hashes = vec![0u64; array.entries().len()];
274    create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
275
276    // Combine the hashes for entries on each row with each other and previous hash for that row
277    if let Some(nulls) = nulls {
278        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
279            if nulls.is_valid(i) {
280                let hash = &mut hashes_buffer[i];
281                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
282                    *hash = combine_hashes(*hash, *values_hash);
283                }
284            }
285        }
286    } else {
287        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
288            let hash = &mut hashes_buffer[i];
289            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
290                *hash = combine_hashes(*hash, *values_hash);
291            }
292        }
293    }
294
295    Ok(())
296}
297
298#[cfg(not(feature = "force_hash_collisions"))]
299fn hash_list_array<OffsetSize>(
300    array: &GenericListArray<OffsetSize>,
301    random_state: &RandomState,
302    hashes_buffer: &mut [u64],
303) -> Result<()>
304where
305    OffsetSize: OffsetSizeTrait,
306{
307    let values = array.values();
308    let offsets = array.value_offsets();
309    let nulls = array.nulls();
310    let mut values_hashes = vec![0u64; values.len()];
311    create_hashes([values], random_state, &mut values_hashes)?;
312    if let Some(nulls) = nulls {
313        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
314            if nulls.is_valid(i) {
315                let hash = &mut hashes_buffer[i];
316                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
317                    *hash = combine_hashes(*hash, *values_hash);
318                }
319            }
320        }
321    } else {
322        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
323            let hash = &mut hashes_buffer[i];
324            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
325                *hash = combine_hashes(*hash, *values_hash);
326            }
327        }
328    }
329    Ok(())
330}
331
332#[cfg(not(feature = "force_hash_collisions"))]
333fn hash_fixed_list_array(
334    array: &FixedSizeListArray,
335    random_state: &RandomState,
336    hashes_buffer: &mut [u64],
337) -> Result<()> {
338    let values = array.values();
339    let value_length = array.value_length() as usize;
340    let nulls = array.nulls();
341    let mut values_hashes = vec![0u64; values.len()];
342    create_hashes([values], random_state, &mut values_hashes)?;
343    if let Some(nulls) = nulls {
344        for i in 0..array.len() {
345            if nulls.is_valid(i) {
346                let hash = &mut hashes_buffer[i];
347                for values_hash in
348                    &values_hashes[i * value_length..(i + 1) * value_length]
349                {
350                    *hash = combine_hashes(*hash, *values_hash);
351                }
352            }
353        }
354    } else {
355        for i in 0..array.len() {
356            let hash = &mut hashes_buffer[i];
357            for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
358                *hash = combine_hashes(*hash, *values_hash);
359            }
360        }
361    }
362    Ok(())
363}
364
365/// Internal helper function that hashes a single array and either initializes or combines
366/// the hash values in the buffer.
367#[cfg(not(feature = "force_hash_collisions"))]
368fn hash_single_array(
369    array: &dyn Array,
370    random_state: &RandomState,
371    hashes_buffer: &mut [u64],
372    rehash: bool,
373) -> Result<()> {
374    downcast_primitive_array! {
375        array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
376        DataType::Null => hash_null(random_state, hashes_buffer, rehash),
377        DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
378        DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
379        DataType::Utf8View => hash_array(&as_string_view_array(array)?, random_state, hashes_buffer, rehash),
380        DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
381        DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
382        DataType::BinaryView => hash_array(&as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
383        DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
384        DataType::FixedSizeBinary(_) => {
385            let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
386            hash_array(&array, random_state, hashes_buffer, rehash)
387        }
388        DataType::Dictionary(_, _) => downcast_dictionary_array! {
389            array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
390            _ => unreachable!()
391        }
392        DataType::Struct(_) => {
393            let array = as_struct_array(array)?;
394            hash_struct_array(array, random_state, hashes_buffer)?;
395        }
396        DataType::List(_) => {
397            let array = as_list_array(array)?;
398            hash_list_array(array, random_state, hashes_buffer)?;
399        }
400        DataType::LargeList(_) => {
401            let array = as_large_list_array(array)?;
402            hash_list_array(array, random_state, hashes_buffer)?;
403        }
404        DataType::Map(_, _) => {
405            let array = as_map_array(array)?;
406            hash_map_array(array, random_state, hashes_buffer)?;
407        }
408        DataType::FixedSizeList(_,_) => {
409            let array = as_fixed_size_list_array(array)?;
410            hash_fixed_list_array(array, random_state, hashes_buffer)?;
411        }
412        _ => {
413            // This is internal because we should have caught this before.
414            return _internal_err!(
415                "Unsupported data type in hasher: {}",
416                array.data_type()
417            );
418        }
419    }
420    Ok(())
421}
422
423/// Test version of `hash_single_array` that forces all hashes to collide to zero.
424#[cfg(feature = "force_hash_collisions")]
425fn hash_single_array(
426    _array: &dyn Array,
427    _random_state: &RandomState,
428    hashes_buffer: &mut [u64],
429    _rehash: bool,
430) -> Result<()> {
431    for hash in hashes_buffer.iter_mut() {
432        *hash = 0
433    }
434    Ok(())
435}
436
437/// Something that can be returned as a `&dyn Array`.
438///
439/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`,
440/// and this seems the best way to do so.
441///
442/// We tried having it accept `AsRef<dyn Array>`
443/// but that is not implemented for and cannot be implemented for
444/// `&dyn Array` so callers that have the latter would not be able
445/// to call `create_hashes` directly. This shim trait makes it possible.
446pub trait AsDynArray {
447    fn as_dyn_array(&self) -> &dyn Array;
448}
449
450impl AsDynArray for dyn Array {
451    fn as_dyn_array(&self) -> &dyn Array {
452        self
453    }
454}
455
456impl AsDynArray for &dyn Array {
457    fn as_dyn_array(&self) -> &dyn Array {
458        *self
459    }
460}
461
462impl AsDynArray for ArrayRef {
463    fn as_dyn_array(&self) -> &dyn Array {
464        self.as_ref()
465    }
466}
467
468impl AsDynArray for &ArrayRef {
469    fn as_dyn_array(&self) -> &dyn Array {
470        self.as_ref()
471    }
472}
473
474/// Creates hash values for every row, based on the values in the columns.
475///
476/// The number of rows to hash is determined by `hashes_buffer.len()`.
477/// `hashes_buffer` should be pre-sized appropriately.
478pub fn create_hashes<'a, I, T>(
479    arrays: I,
480    random_state: &RandomState,
481    hashes_buffer: &'a mut Vec<u64>,
482) -> Result<&'a mut Vec<u64>>
483where
484    I: IntoIterator<Item = T>,
485    T: AsDynArray,
486{
487    for (i, array) in arrays.into_iter().enumerate() {
488        // combine hashes with `combine_hashes` for all columns besides the first
489        let rehash = i >= 1;
490        hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
491    }
492    Ok(hashes_buffer)
493}
494
495#[cfg(test)]
496mod tests {
497    use std::sync::Arc;
498
499    use arrow::array::*;
500    #[cfg(not(feature = "force_hash_collisions"))]
501    use arrow::datatypes::*;
502
503    use super::*;
504
505    #[test]
506    fn create_hashes_for_decimal_array() -> Result<()> {
507        let array = vec![1, 2, 3, 4]
508            .into_iter()
509            .map(Some)
510            .collect::<Decimal128Array>()
511            .with_precision_and_scale(20, 3)
512            .unwrap();
513        let array_ref: ArrayRef = Arc::new(array);
514        let random_state = RandomState::with_seeds(0, 0, 0, 0);
515        let hashes_buff = &mut vec![0; array_ref.len()];
516        let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
517        assert_eq!(hashes.len(), 4);
518        Ok(())
519    }
520
521    #[test]
522    fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
523        let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
524        let random_state = RandomState::with_seeds(0, 0, 0, 0);
525        let hashes_buff = &mut vec![0; 0];
526        let hashes = create_hashes(
527            &[Arc::new(empty_array) as ArrayRef],
528            &random_state,
529            hashes_buff,
530        )?;
531        assert_eq!(hashes, &Vec::<u64>::new());
532        Ok(())
533    }
534
535    #[test]
536    fn create_hashes_for_float_arrays() -> Result<()> {
537        let f32_arr: ArrayRef =
538            Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
539        let f64_arr: ArrayRef =
540            Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
541
542        let random_state = RandomState::with_seeds(0, 0, 0, 0);
543        let hashes_buff = &mut vec![0; f32_arr.len()];
544        let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
545        assert_eq!(hashes.len(), 4,);
546
547        let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
548        assert_eq!(hashes.len(), 4,);
549
550        Ok(())
551    }
552
553    macro_rules! create_hash_binary {
554        ($NAME:ident, $ARRAY:ty) => {
555            #[cfg(not(feature = "force_hash_collisions"))]
556            #[test]
557            fn $NAME() {
558                let binary = [
559                    Some(b"short".to_byte_slice()),
560                    None,
561                    Some(b"long but different 12 bytes string"),
562                    Some(b"short2"),
563                    Some(b"Longer than 12 bytes string"),
564                    Some(b"short"),
565                    Some(b"Longer than 12 bytes string"),
566                ];
567
568                let binary_array: ArrayRef =
569                    Arc::new(binary.iter().cloned().collect::<$ARRAY>());
570                let ref_array: ArrayRef =
571                    Arc::new(binary.iter().cloned().collect::<BinaryArray>());
572
573                let random_state = RandomState::with_seeds(0, 0, 0, 0);
574
575                let mut binary_hashes = vec![0; binary.len()];
576                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
577                    .unwrap();
578
579                let mut ref_hashes = vec![0; binary.len()];
580                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
581
582                // Null values result in a zero hash,
583                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
584                    match val {
585                        Some(_) => assert_ne!(*hash, 0),
586                        None => assert_eq!(*hash, 0),
587                    }
588                }
589
590                // same logical values should hash to the same hash value
591                assert_eq!(binary_hashes, ref_hashes);
592
593                // Same values should map to same hash values
594                assert_eq!(binary[0], binary[5]);
595                assert_eq!(binary[4], binary[6]);
596
597                // different binary should map to different hash values
598                assert_ne!(binary[0], binary[2]);
599            }
600        };
601    }
602
603    create_hash_binary!(binary_array, BinaryArray);
604    create_hash_binary!(binary_view_array, BinaryViewArray);
605
606    #[test]
607    fn create_hashes_fixed_size_binary() -> Result<()> {
608        let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
609        let fixed_size_binary_array: ArrayRef =
610            Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
611
612        let random_state = RandomState::with_seeds(0, 0, 0, 0);
613        let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
614        let hashes =
615            create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
616        assert_eq!(hashes.len(), 3,);
617
618        Ok(())
619    }
620
621    macro_rules! create_hash_string {
622        ($NAME:ident, $ARRAY:ty) => {
623            #[cfg(not(feature = "force_hash_collisions"))]
624            #[test]
625            fn $NAME() {
626                let strings = [
627                    Some("short"),
628                    None,
629                    Some("long but different 12 bytes string"),
630                    Some("short2"),
631                    Some("Longer than 12 bytes string"),
632                    Some("short"),
633                    Some("Longer than 12 bytes string"),
634                ];
635
636                let string_array: ArrayRef =
637                    Arc::new(strings.iter().cloned().collect::<$ARRAY>());
638                let dict_array: ArrayRef = Arc::new(
639                    strings
640                        .iter()
641                        .cloned()
642                        .collect::<DictionaryArray<Int8Type>>(),
643                );
644
645                let random_state = RandomState::with_seeds(0, 0, 0, 0);
646
647                let mut string_hashes = vec![0; strings.len()];
648                create_hashes(&[string_array], &random_state, &mut string_hashes)
649                    .unwrap();
650
651                let mut dict_hashes = vec![0; strings.len()];
652                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
653
654                // Null values result in a zero hash,
655                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
656                    match val {
657                        Some(_) => assert_ne!(*hash, 0),
658                        None => assert_eq!(*hash, 0),
659                    }
660                }
661
662                // same logical values should hash to the same hash value
663                assert_eq!(string_hashes, dict_hashes);
664
665                // Same values should map to same hash values
666                assert_eq!(strings[0], strings[5]);
667                assert_eq!(strings[4], strings[6]);
668
669                // different strings should map to different hash values
670                assert_ne!(strings[0], strings[2]);
671            }
672        };
673    }
674
675    create_hash_string!(string_array, StringArray);
676    create_hash_string!(large_string_array, LargeStringArray);
677    create_hash_string!(string_view_array, StringArray);
678    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
679
680    #[test]
681    // Tests actual values of hashes, which are different if forcing collisions
682    #[cfg(not(feature = "force_hash_collisions"))]
683    fn create_hashes_for_dict_arrays() {
684        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
685
686        let string_array: ArrayRef =
687            Arc::new(strings.iter().cloned().collect::<StringArray>());
688        let dict_array: ArrayRef = Arc::new(
689            strings
690                .iter()
691                .cloned()
692                .collect::<DictionaryArray<Int8Type>>(),
693        );
694
695        let random_state = RandomState::with_seeds(0, 0, 0, 0);
696
697        let mut string_hashes = vec![0; strings.len()];
698        create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
699
700        let mut dict_hashes = vec![0; strings.len()];
701        create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
702
703        // Null values result in a zero hash,
704        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
705            match val {
706                Some(_) => assert_ne!(*hash, 0),
707                None => assert_eq!(*hash, 0),
708            }
709        }
710
711        // same logical values should hash to the same hash value
712        assert_eq!(string_hashes, dict_hashes);
713
714        // Same values should map to same hash values
715        assert_eq!(strings[1], strings[4]);
716        assert_eq!(dict_hashes[1], dict_hashes[4]);
717        assert_eq!(strings[0], strings[3]);
718        assert_eq!(dict_hashes[0], dict_hashes[3]);
719
720        // different strings should map to different hash values
721        assert_ne!(strings[0], strings[2]);
722        assert_ne!(dict_hashes[0], dict_hashes[2]);
723    }
724
725    #[test]
726    // Tests actual values of hashes, which are different if forcing collisions
727    #[cfg(not(feature = "force_hash_collisions"))]
728    fn create_hashes_for_list_arrays() {
729        let data = vec![
730            Some(vec![Some(0), Some(1), Some(2)]),
731            None,
732            Some(vec![Some(3), None, Some(5)]),
733            Some(vec![Some(3), None, Some(5)]),
734            None,
735            Some(vec![Some(0), Some(1), Some(2)]),
736            Some(vec![]),
737        ];
738        let list_array =
739            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
740        let random_state = RandomState::with_seeds(0, 0, 0, 0);
741        let mut hashes = vec![0; list_array.len()];
742        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
743        assert_eq!(hashes[0], hashes[5]);
744        assert_eq!(hashes[1], hashes[4]);
745        assert_eq!(hashes[2], hashes[3]);
746        assert_eq!(hashes[1], hashes[6]); // null vs empty list
747    }
748
749    #[test]
750    // Tests actual values of hashes, which are different if forcing collisions
751    #[cfg(not(feature = "force_hash_collisions"))]
752    fn create_hashes_for_fixed_size_list_arrays() {
753        let data = vec![
754            Some(vec![Some(0), Some(1), Some(2)]),
755            None,
756            Some(vec![Some(3), None, Some(5)]),
757            Some(vec![Some(3), None, Some(5)]),
758            None,
759            Some(vec![Some(0), Some(1), Some(2)]),
760        ];
761        let list_array =
762            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
763                data, 3,
764            )) as ArrayRef;
765        let random_state = RandomState::with_seeds(0, 0, 0, 0);
766        let mut hashes = vec![0; list_array.len()];
767        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
768        assert_eq!(hashes[0], hashes[5]);
769        assert_eq!(hashes[1], hashes[4]);
770        assert_eq!(hashes[2], hashes[3]);
771    }
772
773    #[test]
774    // Tests actual values of hashes, which are different if forcing collisions
775    #[cfg(not(feature = "force_hash_collisions"))]
776    fn create_hashes_for_struct_arrays() {
777        use arrow::buffer::Buffer;
778
779        let boolarr = Arc::new(BooleanArray::from(vec![
780            false, false, true, true, true, true,
781        ]));
782        let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
783
784        let struct_array = StructArray::from((
785            vec![
786                (
787                    Arc::new(Field::new("bool", DataType::Boolean, false)),
788                    Arc::clone(&boolarr) as ArrayRef,
789                ),
790                (
791                    Arc::new(Field::new("i32", DataType::Int32, false)),
792                    Arc::clone(&i32arr) as ArrayRef,
793                ),
794                (
795                    Arc::new(Field::new("i32", DataType::Int32, false)),
796                    Arc::clone(&i32arr) as ArrayRef,
797                ),
798                (
799                    Arc::new(Field::new("bool", DataType::Boolean, false)),
800                    Arc::clone(&boolarr) as ArrayRef,
801                ),
802            ],
803            Buffer::from(&[0b001011]),
804        ));
805
806        assert!(struct_array.is_valid(0));
807        assert!(struct_array.is_valid(1));
808        assert!(struct_array.is_null(2));
809        assert!(struct_array.is_valid(3));
810        assert!(struct_array.is_null(4));
811        assert!(struct_array.is_null(5));
812
813        let array = Arc::new(struct_array) as ArrayRef;
814
815        let random_state = RandomState::with_seeds(0, 0, 0, 0);
816        let mut hashes = vec![0; array.len()];
817        create_hashes(&[array], &random_state, &mut hashes).unwrap();
818        assert_eq!(hashes[0], hashes[1]);
819        // same value but the third row ( hashes[2] ) is null
820        assert_ne!(hashes[2], hashes[3]);
821        // different values but both are null
822        assert_eq!(hashes[4], hashes[5]);
823    }
824
825    #[test]
826    // Tests actual values of hashes, which are different if forcing collisions
827    #[cfg(not(feature = "force_hash_collisions"))]
828    fn create_hashes_for_struct_arrays_more_column_than_row() {
829        let struct_array = StructArray::from(vec![
830            (
831                Arc::new(Field::new("bool", DataType::Boolean, false)),
832                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
833            ),
834            (
835                Arc::new(Field::new("i32-1", DataType::Int32, false)),
836                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
837            ),
838            (
839                Arc::new(Field::new("i32-2", DataType::Int32, false)),
840                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
841            ),
842            (
843                Arc::new(Field::new("i32-3", DataType::Int32, false)),
844                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
845            ),
846        ]);
847
848        assert!(struct_array.is_valid(0));
849        assert!(struct_array.is_valid(1));
850
851        let array = Arc::new(struct_array) as ArrayRef;
852        let random_state = RandomState::with_seeds(0, 0, 0, 0);
853        let mut hashes = vec![0; array.len()];
854        create_hashes(&[array], &random_state, &mut hashes).unwrap();
855        assert_eq!(hashes[0], hashes[1]);
856    }
857
858    #[test]
859    // Tests actual values of hashes, which are different if forcing collisions
860    #[cfg(not(feature = "force_hash_collisions"))]
861    fn create_hashes_for_map_arrays() {
862        let mut builder =
863            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
864        // Row 0
865        builder.keys().append_value("key1");
866        builder.keys().append_value("key2");
867        builder.values().append_value(1);
868        builder.values().append_value(2);
869        builder.append(true).unwrap();
870        // Row 1
871        builder.keys().append_value("key1");
872        builder.keys().append_value("key2");
873        builder.values().append_value(1);
874        builder.values().append_value(2);
875        builder.append(true).unwrap();
876        // Row 2
877        builder.keys().append_value("key1");
878        builder.keys().append_value("key2");
879        builder.values().append_value(1);
880        builder.values().append_value(3);
881        builder.append(true).unwrap();
882        // Row 3
883        builder.keys().append_value("key1");
884        builder.keys().append_value("key3");
885        builder.values().append_value(1);
886        builder.values().append_value(2);
887        builder.append(true).unwrap();
888        // Row 4
889        builder.keys().append_value("key1");
890        builder.values().append_value(1);
891        builder.append(true).unwrap();
892        // Row 5
893        builder.keys().append_value("key1");
894        builder.values().append_null();
895        builder.append(true).unwrap();
896        // Row 6
897        builder.append(true).unwrap();
898        // Row 7
899        builder.keys().append_value("key1");
900        builder.values().append_value(1);
901        builder.append(false).unwrap();
902
903        let array = Arc::new(builder.finish()) as ArrayRef;
904
905        let random_state = RandomState::with_seeds(0, 0, 0, 0);
906        let mut hashes = vec![0; array.len()];
907        create_hashes(&[array], &random_state, &mut hashes).unwrap();
908        assert_eq!(hashes[0], hashes[1]); // same value
909        assert_ne!(hashes[0], hashes[2]); // different value
910        assert_ne!(hashes[0], hashes[3]); // different key
911        assert_ne!(hashes[0], hashes[4]); // missing an entry
912        assert_ne!(hashes[4], hashes[5]); // filled vs null value
913        assert_eq!(hashes[6], hashes[7]); // empty vs null map
914    }
915
916    #[test]
917    // Tests actual values of hashes, which are different if forcing collisions
918    #[cfg(not(feature = "force_hash_collisions"))]
919    fn create_multi_column_hash_for_dict_arrays() {
920        let strings1 = [Some("foo"), None, Some("bar")];
921        let strings2 = [Some("blarg"), Some("blah"), None];
922
923        let string_array: ArrayRef =
924            Arc::new(strings1.iter().cloned().collect::<StringArray>());
925        let dict_array: ArrayRef = Arc::new(
926            strings2
927                .iter()
928                .cloned()
929                .collect::<DictionaryArray<Int32Type>>(),
930        );
931
932        let random_state = RandomState::with_seeds(0, 0, 0, 0);
933
934        let mut one_col_hashes = vec![0; strings1.len()];
935        create_hashes(
936            &[Arc::clone(&dict_array) as ArrayRef],
937            &random_state,
938            &mut one_col_hashes,
939        )
940        .unwrap();
941
942        let mut two_col_hashes = vec![0; strings1.len()];
943        create_hashes(
944            &[dict_array, string_array],
945            &random_state,
946            &mut two_col_hashes,
947        )
948        .unwrap();
949
950        assert_eq!(one_col_hashes.len(), 3);
951        assert_eq!(two_col_hashes.len(), 3);
952
953        assert_ne!(one_col_hashes, two_col_hashes);
954    }
955
956    #[test]
957    fn test_create_hashes_from_arrays() {
958        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
959        let float_array: ArrayRef =
960            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
961
962        let random_state = RandomState::with_seeds(0, 0, 0, 0);
963        let hashes_buff = &mut vec![0; int_array.len()];
964        let hashes =
965            create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
966        assert_eq!(hashes.len(), 4,);
967    }
968
969    #[test]
970    fn test_create_hashes_from_dyn_arrays() {
971        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
972        let float_array: ArrayRef =
973            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
974
975        // Verify that we can call create_hashes with only &dyn Array
976        fn test(arr1: &dyn Array, arr2: &dyn Array) {
977            let random_state = RandomState::with_seeds(0, 0, 0, 0);
978            let hashes_buff = &mut vec![0; arr1.len()];
979            let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
980            assert_eq!(hashes.len(), 4,);
981        }
982        test(&*int_array, &*float_array);
983    }
984
985    #[test]
986    fn test_create_hashes_equivalence() {
987        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
988        let random_state = RandomState::with_seeds(0, 0, 0, 0);
989
990        let mut hashes1 = vec![0; array.len()];
991        create_hashes(
992            &[Arc::clone(&array) as ArrayRef],
993            &random_state,
994            &mut hashes1,
995        )
996        .unwrap();
997
998        let mut hashes2 = vec![0; array.len()];
999        create_hashes([array], &random_state, &mut hashes2).unwrap();
1000
1001        assert_eq!(hashes1, hashes2);
1002    }
1003}