datafusion_common/
hash_utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Functionality used both on logical and physical plans
19
20#[cfg(not(feature = "force_hash_collisions"))]
21use std::sync::Arc;
22
23use ahash::RandomState;
24use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
25use arrow::array::*;
26use arrow::datatypes::*;
27#[cfg(not(feature = "force_hash_collisions"))]
28use arrow::{downcast_dictionary_array, downcast_primitive_array};
29
30#[cfg(not(feature = "force_hash_collisions"))]
31use crate::cast::{
32    as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
33    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
34    as_string_array, as_string_view_array, as_struct_array,
35};
36use crate::error::Result;
37#[cfg(not(feature = "force_hash_collisions"))]
38use crate::error::_internal_err;
39
40// Combines two hashes into one hash
41#[inline]
42pub fn combine_hashes(l: u64, r: u64) -> u64 {
43    let hash = (17 * 37u64).wrapping_add(l);
44    hash.wrapping_mul(37).wrapping_add(r)
45}
46
47#[cfg(not(feature = "force_hash_collisions"))]
48fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
49    if mul_col {
50        hashes_buffer.iter_mut().for_each(|hash| {
51            // stable hash for null value
52            *hash = combine_hashes(random_state.hash_one(1), *hash);
53        })
54    } else {
55        hashes_buffer.iter_mut().for_each(|hash| {
56            *hash = random_state.hash_one(1);
57        })
58    }
59}
60
61pub trait HashValue {
62    fn hash_one(&self, state: &RandomState) -> u64;
63}
64
65impl<T: HashValue + ?Sized> HashValue for &T {
66    fn hash_one(&self, state: &RandomState) -> u64 {
67        T::hash_one(self, state)
68    }
69}
70
71macro_rules! hash_value {
72    ($($t:ty),+) => {
73        $(impl HashValue for $t {
74            fn hash_one(&self, state: &RandomState) -> u64 {
75                state.hash_one(self)
76            }
77        })+
78    };
79}
80hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
81hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
82
83macro_rules! hash_float_value {
84    ($(($t:ty, $i:ty)),+) => {
85        $(impl HashValue for $t {
86            fn hash_one(&self, state: &RandomState) -> u64 {
87                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
88            }
89        })+
90    };
91}
92hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
93
94/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
95/// If `rehash==true` this combines the previous hash value in the buffer
96/// with the new hash using `combine_hashes`
97#[cfg(not(feature = "force_hash_collisions"))]
98fn hash_array_primitive<T>(
99    array: &PrimitiveArray<T>,
100    random_state: &RandomState,
101    hashes_buffer: &mut [u64],
102    rehash: bool,
103) where
104    T: ArrowPrimitiveType<Native: HashValue>,
105{
106    assert_eq!(
107        hashes_buffer.len(),
108        array.len(),
109        "hashes_buffer and array should be of equal length"
110    );
111
112    if array.null_count() == 0 {
113        if rehash {
114            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
115                *hash = combine_hashes(value.hash_one(random_state), *hash);
116            }
117        } else {
118            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
119                *hash = value.hash_one(random_state);
120            }
121        }
122    } else if rehash {
123        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
124            if !array.is_null(i) {
125                let value = unsafe { array.value_unchecked(i) };
126                *hash = combine_hashes(value.hash_one(random_state), *hash);
127            }
128        }
129    } else {
130        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
131            if !array.is_null(i) {
132                let value = unsafe { array.value_unchecked(i) };
133                *hash = value.hash_one(random_state);
134            }
135        }
136    }
137}
138
139/// Hashes one array into the `hashes_buffer`
140/// If `rehash==true` this combines the previous hash value in the buffer
141/// with the new hash using `combine_hashes`
142#[cfg(not(feature = "force_hash_collisions"))]
143fn hash_array<T>(
144    array: T,
145    random_state: &RandomState,
146    hashes_buffer: &mut [u64],
147    rehash: bool,
148) where
149    T: ArrayAccessor,
150    T::Item: HashValue,
151{
152    assert_eq!(
153        hashes_buffer.len(),
154        array.len(),
155        "hashes_buffer and array should be of equal length"
156    );
157
158    if array.null_count() == 0 {
159        if rehash {
160            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
161                let value = unsafe { array.value_unchecked(i) };
162                *hash = combine_hashes(value.hash_one(random_state), *hash);
163            }
164        } else {
165            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
166                let value = unsafe { array.value_unchecked(i) };
167                *hash = value.hash_one(random_state);
168            }
169        }
170    } else if rehash {
171        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
172            if !array.is_null(i) {
173                let value = unsafe { array.value_unchecked(i) };
174                *hash = combine_hashes(value.hash_one(random_state), *hash);
175            }
176        }
177    } else {
178        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
179            if !array.is_null(i) {
180                let value = unsafe { array.value_unchecked(i) };
181                *hash = value.hash_one(random_state);
182            }
183        }
184    }
185}
186
187/// Helper function to update hash for a dictionary key if the value is valid
188#[cfg(not(feature = "force_hash_collisions"))]
189#[inline]
190fn update_hash_for_dict_key(
191    hash: &mut u64,
192    dict_hashes: &[u64],
193    dict_values: &dyn Array,
194    idx: usize,
195    multi_col: bool,
196) {
197    if dict_values.is_valid(idx) {
198        if multi_col {
199            *hash = combine_hashes(dict_hashes[idx], *hash);
200        } else {
201            *hash = dict_hashes[idx];
202        }
203    }
204    // no update for invalid dictionary value
205}
206
207/// Hash the values in a dictionary array
208#[cfg(not(feature = "force_hash_collisions"))]
209fn hash_dictionary<K: ArrowDictionaryKeyType>(
210    array: &DictionaryArray<K>,
211    random_state: &RandomState,
212    hashes_buffer: &mut [u64],
213    multi_col: bool,
214) -> Result<()> {
215    // Hash each dictionary value once, and then use that computed
216    // hash for each key value to avoid a potentially expensive
217    // redundant hashing for large dictionary elements (e.g. strings)
218    let dict_values = Arc::clone(array.values());
219    let mut dict_hashes = vec![0; dict_values.len()];
220    create_hashes(&[dict_values], random_state, &mut dict_hashes)?;
221
222    // combine hash for each index in values
223    let dict_values = array.values();
224    for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
225        if let Some(key) = key {
226            let idx = key.as_usize();
227            update_hash_for_dict_key(
228                hash,
229                &dict_hashes,
230                dict_values.as_ref(),
231                idx,
232                multi_col,
233            );
234        } // no update for Null key
235    }
236    Ok(())
237}
238
239#[cfg(not(feature = "force_hash_collisions"))]
240fn hash_struct_array(
241    array: &StructArray,
242    random_state: &RandomState,
243    hashes_buffer: &mut [u64],
244) -> Result<()> {
245    let nulls = array.nulls();
246    let row_len = array.len();
247
248    let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
249        nulls.valid_indices().collect()
250    } else {
251        (0..row_len).collect()
252    };
253
254    // Create hashes for each row that combines the hashes over all the column at that row.
255    let mut values_hashes = vec![0u64; row_len];
256    create_hashes(array.columns(), random_state, &mut values_hashes)?;
257
258    for i in valid_row_indices {
259        let hash = &mut hashes_buffer[i];
260        *hash = combine_hashes(*hash, values_hashes[i]);
261    }
262
263    Ok(())
264}
265
266// only adding this `cfg` b/c this function is only used with this `cfg`
267#[cfg(not(feature = "force_hash_collisions"))]
268fn hash_map_array(
269    array: &MapArray,
270    random_state: &RandomState,
271    hashes_buffer: &mut [u64],
272) -> Result<()> {
273    let nulls = array.nulls();
274    let offsets = array.offsets();
275
276    // Create hashes for each entry in each row
277    let mut values_hashes = vec![0u64; array.entries().len()];
278    create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
279
280    // Combine the hashes for entries on each row with each other and previous hash for that row
281    if let Some(nulls) = nulls {
282        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
283            if nulls.is_valid(i) {
284                let hash = &mut hashes_buffer[i];
285                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
286                    *hash = combine_hashes(*hash, *values_hash);
287                }
288            }
289        }
290    } else {
291        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
292            let hash = &mut hashes_buffer[i];
293            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
294                *hash = combine_hashes(*hash, *values_hash);
295            }
296        }
297    }
298
299    Ok(())
300}
301
302#[cfg(not(feature = "force_hash_collisions"))]
303fn hash_list_array<OffsetSize>(
304    array: &GenericListArray<OffsetSize>,
305    random_state: &RandomState,
306    hashes_buffer: &mut [u64],
307) -> Result<()>
308where
309    OffsetSize: OffsetSizeTrait,
310{
311    let values = Arc::clone(array.values());
312    let offsets = array.value_offsets();
313    let nulls = array.nulls();
314    let mut values_hashes = vec![0u64; values.len()];
315    create_hashes(&[values], random_state, &mut values_hashes)?;
316    if let Some(nulls) = nulls {
317        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
318            if nulls.is_valid(i) {
319                let hash = &mut hashes_buffer[i];
320                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
321                    *hash = combine_hashes(*hash, *values_hash);
322                }
323            }
324        }
325    } else {
326        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
327            let hash = &mut hashes_buffer[i];
328            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
329                *hash = combine_hashes(*hash, *values_hash);
330            }
331        }
332    }
333    Ok(())
334}
335
336#[cfg(not(feature = "force_hash_collisions"))]
337fn hash_fixed_list_array(
338    array: &FixedSizeListArray,
339    random_state: &RandomState,
340    hashes_buffer: &mut [u64],
341) -> Result<()> {
342    let values = Arc::clone(array.values());
343    let value_length = array.value_length() as usize;
344    let nulls = array.nulls();
345    let mut values_hashes = vec![0u64; values.len()];
346    create_hashes(&[values], random_state, &mut values_hashes)?;
347    if let Some(nulls) = nulls {
348        for i in 0..array.len() {
349            if nulls.is_valid(i) {
350                let hash = &mut hashes_buffer[i];
351                for values_hash in
352                    &values_hashes[i * value_length..(i + 1) * value_length]
353                {
354                    *hash = combine_hashes(*hash, *values_hash);
355                }
356            }
357        }
358    } else {
359        for i in 0..array.len() {
360            let hash = &mut hashes_buffer[i];
361            for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
362                *hash = combine_hashes(*hash, *values_hash);
363            }
364        }
365    }
366    Ok(())
367}
368
369/// Test version of `create_hashes` that produces the same value for
370/// all hashes (to test collisions)
371///
372/// See comments on `hashes_buffer` for more details
373#[cfg(feature = "force_hash_collisions")]
374pub fn create_hashes<'a>(
375    _arrays: &[ArrayRef],
376    _random_state: &RandomState,
377    hashes_buffer: &'a mut Vec<u64>,
378) -> Result<&'a mut Vec<u64>> {
379    for hash in hashes_buffer.iter_mut() {
380        *hash = 0
381    }
382    Ok(hashes_buffer)
383}
384
385/// Creates hash values for every row, based on the values in the
386/// columns.
387///
388/// The number of rows to hash is determined by `hashes_buffer.len()`.
389/// `hashes_buffer` should be pre-sized appropriately
390#[cfg(not(feature = "force_hash_collisions"))]
391pub fn create_hashes<'a>(
392    arrays: &[ArrayRef],
393    random_state: &RandomState,
394    hashes_buffer: &'a mut Vec<u64>,
395) -> Result<&'a mut Vec<u64>> {
396    for (i, col) in arrays.iter().enumerate() {
397        let array = col.as_ref();
398        // combine hashes with `combine_hashes` for all columns besides the first
399        let rehash = i >= 1;
400        downcast_primitive_array! {
401            array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
402            DataType::Null => hash_null(random_state, hashes_buffer, rehash),
403            DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
404            DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
405            DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
406            DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
407            DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
408            DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
409            DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
410            DataType::FixedSizeBinary(_) => {
411                let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
412                hash_array(array, random_state, hashes_buffer, rehash)
413            }
414            DataType::Dictionary(_, _) => downcast_dictionary_array! {
415                array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
416                _ => unreachable!()
417            }
418            DataType::Struct(_) => {
419                let array = as_struct_array(array)?;
420                hash_struct_array(array, random_state, hashes_buffer)?;
421            }
422            DataType::List(_) => {
423                let array = as_list_array(array)?;
424                hash_list_array(array, random_state, hashes_buffer)?;
425            }
426            DataType::LargeList(_) => {
427                let array = as_large_list_array(array)?;
428                hash_list_array(array, random_state, hashes_buffer)?;
429            }
430            DataType::Map(_, _) => {
431                let array = as_map_array(array)?;
432                hash_map_array(array, random_state, hashes_buffer)?;
433            }
434            DataType::FixedSizeList(_,_) => {
435                let array = as_fixed_size_list_array(array)?;
436                hash_fixed_list_array(array, random_state, hashes_buffer)?;
437            }
438            _ => {
439                // This is internal because we should have caught this before.
440                return _internal_err!(
441                    "Unsupported data type in hasher: {}",
442                    col.data_type()
443                );
444            }
445        }
446    }
447    Ok(hashes_buffer)
448}
449
450#[cfg(test)]
451mod tests {
452    use std::sync::Arc;
453
454    use arrow::array::*;
455    #[cfg(not(feature = "force_hash_collisions"))]
456    use arrow::datatypes::*;
457
458    use super::*;
459
460    #[test]
461    fn create_hashes_for_decimal_array() -> Result<()> {
462        let array = vec![1, 2, 3, 4]
463            .into_iter()
464            .map(Some)
465            .collect::<Decimal128Array>()
466            .with_precision_and_scale(20, 3)
467            .unwrap();
468        let array_ref = Arc::new(array);
469        let random_state = RandomState::with_seeds(0, 0, 0, 0);
470        let hashes_buff = &mut vec![0; array_ref.len()];
471        let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
472        assert_eq!(hashes.len(), 4);
473        Ok(())
474    }
475
476    #[test]
477    fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
478        let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
479        let random_state = RandomState::with_seeds(0, 0, 0, 0);
480        let hashes_buff = &mut vec![0; 0];
481        let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?;
482        assert_eq!(hashes, &Vec::<u64>::new());
483        Ok(())
484    }
485
486    #[test]
487    fn create_hashes_for_float_arrays() -> Result<()> {
488        let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
489        let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
490
491        let random_state = RandomState::with_seeds(0, 0, 0, 0);
492        let hashes_buff = &mut vec![0; f32_arr.len()];
493        let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
494        assert_eq!(hashes.len(), 4,);
495
496        let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
497        assert_eq!(hashes.len(), 4,);
498
499        Ok(())
500    }
501
502    macro_rules! create_hash_binary {
503        ($NAME:ident, $ARRAY:ty) => {
504            #[cfg(not(feature = "force_hash_collisions"))]
505            #[test]
506            fn $NAME() {
507                let binary = [
508                    Some(b"short".to_byte_slice()),
509                    None,
510                    Some(b"long but different 12 bytes string"),
511                    Some(b"short2"),
512                    Some(b"Longer than 12 bytes string"),
513                    Some(b"short"),
514                    Some(b"Longer than 12 bytes string"),
515                ];
516
517                let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
518                let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
519
520                let random_state = RandomState::with_seeds(0, 0, 0, 0);
521
522                let mut binary_hashes = vec![0; binary.len()];
523                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
524                    .unwrap();
525
526                let mut ref_hashes = vec![0; binary.len()];
527                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
528
529                // Null values result in a zero hash,
530                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
531                    match val {
532                        Some(_) => assert_ne!(*hash, 0),
533                        None => assert_eq!(*hash, 0),
534                    }
535                }
536
537                // same logical values should hash to the same hash value
538                assert_eq!(binary_hashes, ref_hashes);
539
540                // Same values should map to same hash values
541                assert_eq!(binary[0], binary[5]);
542                assert_eq!(binary[4], binary[6]);
543
544                // different binary should map to different hash values
545                assert_ne!(binary[0], binary[2]);
546            }
547        };
548    }
549
550    create_hash_binary!(binary_array, BinaryArray);
551    create_hash_binary!(binary_view_array, BinaryViewArray);
552
553    #[test]
554    fn create_hashes_fixed_size_binary() -> Result<()> {
555        let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
556        let fixed_size_binary_array =
557            Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
558
559        let random_state = RandomState::with_seeds(0, 0, 0, 0);
560        let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
561        let hashes =
562            create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
563        assert_eq!(hashes.len(), 3,);
564
565        Ok(())
566    }
567
568    macro_rules! create_hash_string {
569        ($NAME:ident, $ARRAY:ty) => {
570            #[cfg(not(feature = "force_hash_collisions"))]
571            #[test]
572            fn $NAME() {
573                let strings = [
574                    Some("short"),
575                    None,
576                    Some("long but different 12 bytes string"),
577                    Some("short2"),
578                    Some("Longer than 12 bytes string"),
579                    Some("short"),
580                    Some("Longer than 12 bytes string"),
581                ];
582
583                let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
584                let dict_array = Arc::new(
585                    strings
586                        .iter()
587                        .cloned()
588                        .collect::<DictionaryArray<Int8Type>>(),
589                );
590
591                let random_state = RandomState::with_seeds(0, 0, 0, 0);
592
593                let mut string_hashes = vec![0; strings.len()];
594                create_hashes(&[string_array], &random_state, &mut string_hashes)
595                    .unwrap();
596
597                let mut dict_hashes = vec![0; strings.len()];
598                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
599
600                // Null values result in a zero hash,
601                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
602                    match val {
603                        Some(_) => assert_ne!(*hash, 0),
604                        None => assert_eq!(*hash, 0),
605                    }
606                }
607
608                // same logical values should hash to the same hash value
609                assert_eq!(string_hashes, dict_hashes);
610
611                // Same values should map to same hash values
612                assert_eq!(strings[0], strings[5]);
613                assert_eq!(strings[4], strings[6]);
614
615                // different strings should map to different hash values
616                assert_ne!(strings[0], strings[2]);
617            }
618        };
619    }
620
621    create_hash_string!(string_array, StringArray);
622    create_hash_string!(large_string_array, LargeStringArray);
623    create_hash_string!(string_view_array, StringArray);
624    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
625
626    #[test]
627    // Tests actual values of hashes, which are different if forcing collisions
628    #[cfg(not(feature = "force_hash_collisions"))]
629    fn create_hashes_for_dict_arrays() {
630        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
631
632        let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
633        let dict_array = Arc::new(
634            strings
635                .iter()
636                .cloned()
637                .collect::<DictionaryArray<Int8Type>>(),
638        );
639
640        let random_state = RandomState::with_seeds(0, 0, 0, 0);
641
642        let mut string_hashes = vec![0; strings.len()];
643        create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
644
645        let mut dict_hashes = vec![0; strings.len()];
646        create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
647
648        // Null values result in a zero hash,
649        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
650            match val {
651                Some(_) => assert_ne!(*hash, 0),
652                None => assert_eq!(*hash, 0),
653            }
654        }
655
656        // same logical values should hash to the same hash value
657        assert_eq!(string_hashes, dict_hashes);
658
659        // Same values should map to same hash values
660        assert_eq!(strings[1], strings[4]);
661        assert_eq!(dict_hashes[1], dict_hashes[4]);
662        assert_eq!(strings[0], strings[3]);
663        assert_eq!(dict_hashes[0], dict_hashes[3]);
664
665        // different strings should map to different hash values
666        assert_ne!(strings[0], strings[2]);
667        assert_ne!(dict_hashes[0], dict_hashes[2]);
668    }
669
670    #[test]
671    // Tests actual values of hashes, which are different if forcing collisions
672    #[cfg(not(feature = "force_hash_collisions"))]
673    fn create_hashes_for_list_arrays() {
674        let data = vec![
675            Some(vec![Some(0), Some(1), Some(2)]),
676            None,
677            Some(vec![Some(3), None, Some(5)]),
678            Some(vec![Some(3), None, Some(5)]),
679            None,
680            Some(vec![Some(0), Some(1), Some(2)]),
681            Some(vec![]),
682        ];
683        let list_array =
684            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
685        let random_state = RandomState::with_seeds(0, 0, 0, 0);
686        let mut hashes = vec![0; list_array.len()];
687        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
688        assert_eq!(hashes[0], hashes[5]);
689        assert_eq!(hashes[1], hashes[4]);
690        assert_eq!(hashes[2], hashes[3]);
691        assert_eq!(hashes[1], hashes[6]); // null vs empty list
692    }
693
694    #[test]
695    // Tests actual values of hashes, which are different if forcing collisions
696    #[cfg(not(feature = "force_hash_collisions"))]
697    fn create_hashes_for_fixed_size_list_arrays() {
698        let data = vec![
699            Some(vec![Some(0), Some(1), Some(2)]),
700            None,
701            Some(vec![Some(3), None, Some(5)]),
702            Some(vec![Some(3), None, Some(5)]),
703            None,
704            Some(vec![Some(0), Some(1), Some(2)]),
705        ];
706        let list_array =
707            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
708                data, 3,
709            )) as ArrayRef;
710        let random_state = RandomState::with_seeds(0, 0, 0, 0);
711        let mut hashes = vec![0; list_array.len()];
712        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
713        assert_eq!(hashes[0], hashes[5]);
714        assert_eq!(hashes[1], hashes[4]);
715        assert_eq!(hashes[2], hashes[3]);
716    }
717
718    #[test]
719    // Tests actual values of hashes, which are different if forcing collisions
720    #[cfg(not(feature = "force_hash_collisions"))]
721    fn create_hashes_for_struct_arrays() {
722        use arrow::buffer::Buffer;
723
724        let boolarr = Arc::new(BooleanArray::from(vec![
725            false, false, true, true, true, true,
726        ]));
727        let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
728
729        let struct_array = StructArray::from((
730            vec![
731                (
732                    Arc::new(Field::new("bool", DataType::Boolean, false)),
733                    Arc::clone(&boolarr) as ArrayRef,
734                ),
735                (
736                    Arc::new(Field::new("i32", DataType::Int32, false)),
737                    Arc::clone(&i32arr) as ArrayRef,
738                ),
739                (
740                    Arc::new(Field::new("i32", DataType::Int32, false)),
741                    Arc::clone(&i32arr) as ArrayRef,
742                ),
743                (
744                    Arc::new(Field::new("bool", DataType::Boolean, false)),
745                    Arc::clone(&boolarr) as ArrayRef,
746                ),
747            ],
748            Buffer::from(&[0b001011]),
749        ));
750
751        assert!(struct_array.is_valid(0));
752        assert!(struct_array.is_valid(1));
753        assert!(struct_array.is_null(2));
754        assert!(struct_array.is_valid(3));
755        assert!(struct_array.is_null(4));
756        assert!(struct_array.is_null(5));
757
758        let array = Arc::new(struct_array) as ArrayRef;
759
760        let random_state = RandomState::with_seeds(0, 0, 0, 0);
761        let mut hashes = vec![0; array.len()];
762        create_hashes(&[array], &random_state, &mut hashes).unwrap();
763        assert_eq!(hashes[0], hashes[1]);
764        // same value but the third row ( hashes[2] ) is null
765        assert_ne!(hashes[2], hashes[3]);
766        // different values but both are null
767        assert_eq!(hashes[4], hashes[5]);
768    }
769
770    #[test]
771    // Tests actual values of hashes, which are different if forcing collisions
772    #[cfg(not(feature = "force_hash_collisions"))]
773    fn create_hashes_for_struct_arrays_more_column_than_row() {
774        let struct_array = StructArray::from(vec![
775            (
776                Arc::new(Field::new("bool", DataType::Boolean, false)),
777                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
778            ),
779            (
780                Arc::new(Field::new("i32-1", DataType::Int32, false)),
781                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
782            ),
783            (
784                Arc::new(Field::new("i32-2", DataType::Int32, false)),
785                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
786            ),
787            (
788                Arc::new(Field::new("i32-3", DataType::Int32, false)),
789                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
790            ),
791        ]);
792
793        assert!(struct_array.is_valid(0));
794        assert!(struct_array.is_valid(1));
795
796        let array = Arc::new(struct_array) as ArrayRef;
797        let random_state = RandomState::with_seeds(0, 0, 0, 0);
798        let mut hashes = vec![0; array.len()];
799        create_hashes(&[array], &random_state, &mut hashes).unwrap();
800        assert_eq!(hashes[0], hashes[1]);
801    }
802
803    #[test]
804    // Tests actual values of hashes, which are different if forcing collisions
805    #[cfg(not(feature = "force_hash_collisions"))]
806    fn create_hashes_for_map_arrays() {
807        let mut builder =
808            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
809        // Row 0
810        builder.keys().append_value("key1");
811        builder.keys().append_value("key2");
812        builder.values().append_value(1);
813        builder.values().append_value(2);
814        builder.append(true).unwrap();
815        // Row 1
816        builder.keys().append_value("key1");
817        builder.keys().append_value("key2");
818        builder.values().append_value(1);
819        builder.values().append_value(2);
820        builder.append(true).unwrap();
821        // Row 2
822        builder.keys().append_value("key1");
823        builder.keys().append_value("key2");
824        builder.values().append_value(1);
825        builder.values().append_value(3);
826        builder.append(true).unwrap();
827        // Row 3
828        builder.keys().append_value("key1");
829        builder.keys().append_value("key3");
830        builder.values().append_value(1);
831        builder.values().append_value(2);
832        builder.append(true).unwrap();
833        // Row 4
834        builder.keys().append_value("key1");
835        builder.values().append_value(1);
836        builder.append(true).unwrap();
837        // Row 5
838        builder.keys().append_value("key1");
839        builder.values().append_null();
840        builder.append(true).unwrap();
841        // Row 6
842        builder.append(true).unwrap();
843        // Row 7
844        builder.keys().append_value("key1");
845        builder.values().append_value(1);
846        builder.append(false).unwrap();
847
848        let array = Arc::new(builder.finish()) as ArrayRef;
849
850        let random_state = RandomState::with_seeds(0, 0, 0, 0);
851        let mut hashes = vec![0; array.len()];
852        create_hashes(&[array], &random_state, &mut hashes).unwrap();
853        assert_eq!(hashes[0], hashes[1]); // same value
854        assert_ne!(hashes[0], hashes[2]); // different value
855        assert_ne!(hashes[0], hashes[3]); // different key
856        assert_ne!(hashes[0], hashes[4]); // missing an entry
857        assert_ne!(hashes[4], hashes[5]); // filled vs null value
858        assert_eq!(hashes[6], hashes[7]); // empty vs null map
859    }
860
861    #[test]
862    // Tests actual values of hashes, which are different if forcing collisions
863    #[cfg(not(feature = "force_hash_collisions"))]
864    fn create_multi_column_hash_for_dict_arrays() {
865        let strings1 = [Some("foo"), None, Some("bar")];
866        let strings2 = [Some("blarg"), Some("blah"), None];
867
868        let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
869        let dict_array = Arc::new(
870            strings2
871                .iter()
872                .cloned()
873                .collect::<DictionaryArray<Int32Type>>(),
874        );
875
876        let random_state = RandomState::with_seeds(0, 0, 0, 0);
877
878        let mut one_col_hashes = vec![0; strings1.len()];
879        create_hashes(
880            &[Arc::clone(&dict_array) as ArrayRef],
881            &random_state,
882            &mut one_col_hashes,
883        )
884        .unwrap();
885
886        let mut two_col_hashes = vec![0; strings1.len()];
887        create_hashes(
888            &[dict_array, string_array],
889            &random_state,
890            &mut two_col_hashes,
891        )
892        .unwrap();
893
894        assert_eq!(one_col_hashes.len(), 3);
895        assert_eq!(two_col_hashes.len(), 3);
896
897        assert_ne!(one_col_hashes, two_col_hashes);
898    }
899}