Skip to main content

datafusion_common/
hash_utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Functionality used both on logical and physical plans
19
20use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
21use arrow::array::*;
22#[cfg(not(feature = "force_hash_collisions"))]
23use arrow::compute::take;
24use arrow::datatypes::*;
25#[cfg(not(feature = "force_hash_collisions"))]
26use arrow::{downcast_dictionary_array, downcast_primitive_array};
27use foldhash::fast::FixedState;
28#[cfg(not(feature = "force_hash_collisions"))]
29use itertools::Itertools;
30#[cfg(not(feature = "force_hash_collisions"))]
31use std::collections::HashMap;
32use std::hash::{BuildHasher, Hash, Hasher};
33
34/// The hash random state used throughout DataFusion for hashing.
35pub type RandomState = FixedState;
36
37#[cfg(not(feature = "force_hash_collisions"))]
38use crate::cast::{
39    as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
40    as_generic_binary_array, as_large_list_array, as_large_list_view_array,
41    as_list_array, as_list_view_array, as_map_array, as_string_array,
42    as_string_view_array, as_struct_array, as_union_array,
43};
44use crate::error::Result;
45use crate::error::{_internal_datafusion_err, _internal_err};
46use std::cell::RefCell;
47
48// Combines two hashes into one hash
49#[inline]
50pub fn combine_hashes(l: u64, r: u64) -> u64 {
51    let hash = (17 * 37u64).wrapping_add(l);
52    hash.wrapping_mul(37).wrapping_add(r)
53}
54
55/// Maximum size for the thread-local hash buffer before truncation (4MB = 524,288 u64 elements).
56/// The goal of this is to avoid unbounded memory growth that would appear as a memory leak.
57/// We allow temporary allocations beyond this size, but after use the buffer is truncated
58/// to this size.
59const MAX_BUFFER_SIZE: usize = 524_288;
60
61thread_local! {
62    /// Thread-local buffer for hash computations to avoid repeated allocations.
63    /// The buffer is reused across calls and truncated if it exceeds MAX_BUFFER_SIZE.
64    /// Defaults to a capacity of 8192 u64 elements which is the default batch size.
65    /// This corresponds to 64KB of memory.
66    static HASH_BUFFER: RefCell<Vec<u64>> = const { RefCell::new(Vec::new()) };
67}
68
69/// Creates hashes for the given arrays using a thread-local buffer, then calls the provided callback
70/// with an immutable reference to the computed hashes.
71///
72/// This function manages a thread-local buffer to avoid repeated allocations. The buffer is automatically
73/// truncated if it exceeds `MAX_BUFFER_SIZE` after use.
74///
75/// # Arguments
76/// * `arrays` - The arrays to hash (must contain at least one array)
77/// * `random_state` - The random state for hashing
78/// * `callback` - A function that receives an immutable reference to the hash slice and returns a result
79///
80/// # Errors
81/// Returns an error if:
82/// - No arrays are provided
83/// - The function is called reentrantly (i.e., the callback invokes `with_hashes` again on the same thread)
84/// - The function is called during or after thread destruction
85///
86/// # Example
87/// ```ignore
88/// use datafusion_common::hash_utils::{with_hashes, RandomState};
89/// use arrow::array::{Int32Array, ArrayRef};
90/// use std::sync::Arc;
91///
92/// let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
93/// let random_state = RandomState::default();
94///
95/// let result = with_hashes([&array], &random_state, |hashes| {
96///     // Use the hashes here
97///     Ok(hashes.len())
98/// })?;
99/// ```
100pub fn with_hashes<I, T, F, R>(
101    arrays: I,
102    random_state: &RandomState,
103    callback: F,
104) -> Result<R>
105where
106    I: IntoIterator<Item = T>,
107    T: AsDynArray,
108    F: FnOnce(&[u64]) -> Result<R>,
109{
110    // Peek at the first array to determine buffer size without fully collecting
111    let mut iter = arrays.into_iter().peekable();
112
113    // Get the required size from the first array
114    let required_size = match iter.peek() {
115        Some(arr) => arr.as_dyn_array().len(),
116        None => return _internal_err!("with_hashes requires at least one array"),
117    };
118
119    HASH_BUFFER.try_with(|cell| {
120        let mut buffer = cell.try_borrow_mut()
121            .map_err(|_| _internal_datafusion_err!("with_hashes cannot be called reentrantly on the same thread"))?;
122
123        // Ensure buffer has sufficient length, clearing old values
124        buffer.clear();
125        buffer.resize(required_size, 0);
126
127        // Create hashes in the buffer - this consumes the iterator
128        create_hashes(iter, random_state, &mut buffer[..required_size])?;
129
130        // Execute the callback with an immutable slice
131        let result = callback(&buffer[..required_size])?;
132
133        // Cleanup: truncate if buffer grew too large
134        if buffer.capacity() > MAX_BUFFER_SIZE {
135            buffer.truncate(MAX_BUFFER_SIZE);
136            buffer.shrink_to_fit();
137        }
138
139        Ok(result)
140    }).map_err(|_| _internal_datafusion_err!("with_hashes cannot access thread-local storage during or after thread destruction"))?
141}
142
143#[cfg(not(feature = "force_hash_collisions"))]
144fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
145    if mul_col {
146        hashes_buffer.iter_mut().for_each(|hash| {
147            // stable hash for null value
148            *hash = combine_hashes(random_state.hash_one(1), *hash);
149        })
150    } else {
151        hashes_buffer.iter_mut().for_each(|hash| {
152            *hash = random_state.hash_one(1);
153        })
154    }
155}
156
157pub trait HashValue {
158    fn hash_one(&self, state: &RandomState) -> u64;
159    /// Write this value into an existing hasher (same data as `hash_one`).
160    fn hash_write(&self, hasher: &mut impl Hasher);
161}
162
163impl<T: HashValue + ?Sized> HashValue for &T {
164    fn hash_one(&self, state: &RandomState) -> u64 {
165        T::hash_one(self, state)
166    }
167    fn hash_write(&self, hasher: &mut impl Hasher) {
168        T::hash_write(self, hasher)
169    }
170}
171
172macro_rules! hash_value {
173    ($($t:ty),+) => {
174        $(impl HashValue for $t {
175            fn hash_one(&self, state: &RandomState) -> u64 {
176                state.hash_one(self)
177            }
178            fn hash_write(&self, hasher: &mut impl Hasher) {
179                Hash::hash(self, hasher)
180            }
181        })+
182    };
183}
184hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64, u128);
185hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
186
187macro_rules! hash_float_value {
188    ($(($t:ty, $i:ty)),+) => {
189        $(impl HashValue for $t {
190            fn hash_one(&self, state: &RandomState) -> u64 {
191                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
192            }
193            fn hash_write(&self, hasher: &mut impl Hasher) {
194                hasher.write(&self.to_ne_bytes())
195            }
196        })+
197    };
198}
199hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
200
201/// Create a `SeedableRandomState` whose per-hasher seed incorporates `seed`.
202/// This folds the previous hash into the hasher's initial state so only the
203/// new value needs to pass through the hash function — same cost as `hash_one`.
204#[cfg(not(feature = "force_hash_collisions"))]
205#[inline]
206fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState {
207    foldhash::fast::SeedableRandomState::with_seed(
208        seed,
209        foldhash::SharedSeed::global_fixed(),
210    )
211}
212
213/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
214/// If `rehash==true` this folds the existing hash into the hasher state
215/// and hashes only the new value (avoiding a separate combine step).
216#[cfg(not(feature = "force_hash_collisions"))]
217fn hash_array_primitive<T>(
218    array: &PrimitiveArray<T>,
219    random_state: &RandomState,
220    hashes_buffer: &mut [u64],
221    rehash: bool,
222) where
223    T: ArrowPrimitiveType<Native: HashValue>,
224{
225    assert_eq!(
226        hashes_buffer.len(),
227        array.len(),
228        "hashes_buffer and array should be of equal length"
229    );
230
231    if array.null_count() == 0 {
232        if rehash {
233            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
234                let mut hasher = seeded_state(*hash).build_hasher();
235                value.hash_write(&mut hasher);
236                *hash = hasher.finish();
237            }
238        } else {
239            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
240                *hash = value.hash_one(random_state);
241            }
242        }
243    } else if rehash {
244        for i in array.nulls().unwrap().valid_indices() {
245            let value = unsafe { array.value_unchecked(i) };
246            let mut hasher = seeded_state(hashes_buffer[i]).build_hasher();
247            value.hash_write(&mut hasher);
248            hashes_buffer[i] = hasher.finish();
249        }
250    } else {
251        for i in array.nulls().unwrap().valid_indices() {
252            let value = unsafe { array.value_unchecked(i) };
253            hashes_buffer[i] = value.hash_one(random_state);
254        }
255    }
256}
257
258/// Hashes one array into the `hashes_buffer`
259/// If `rehash==true` this combines the previous hash value in the buffer
260/// with the new hash using `combine_hashes`
261#[cfg(not(feature = "force_hash_collisions"))]
262fn hash_array<T>(
263    array: &T,
264    random_state: &RandomState,
265    hashes_buffer: &mut [u64],
266    rehash: bool,
267) where
268    T: ArrayAccessor,
269    T::Item: HashValue,
270{
271    assert_eq!(
272        hashes_buffer.len(),
273        array.len(),
274        "hashes_buffer and array should be of equal length"
275    );
276
277    if array.null_count() == 0 {
278        if rehash {
279            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
280                let value = unsafe { array.value_unchecked(i) };
281                *hash = combine_hashes(value.hash_one(random_state), *hash);
282            }
283        } else {
284            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
285                let value = unsafe { array.value_unchecked(i) };
286                *hash = value.hash_one(random_state);
287            }
288        }
289    } else if rehash {
290        for i in array.nulls().unwrap().valid_indices() {
291            let value = unsafe { array.value_unchecked(i) };
292            hashes_buffer[i] =
293                combine_hashes(value.hash_one(random_state), hashes_buffer[i]);
294        }
295    } else {
296        for i in array.nulls().unwrap().valid_indices() {
297            let value = unsafe { array.value_unchecked(i) };
298            hashes_buffer[i] = value.hash_one(random_state);
299        }
300    }
301}
302
303/// Hash a StringView or BytesView array
304///
305/// Templated to optimize inner loop based on presence of nulls and external buffers.
306///
307/// HAS_NULLS: do we have to check null in the inner loop
308/// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes
309/// REHASH: if true, combining with existing hash, otherwise initializing
310#[cfg(not(feature = "force_hash_collisions"))]
311#[inline(never)]
312fn hash_string_view_array_inner<
313    T: ByteViewType,
314    const HAS_NULLS: bool,
315    const HAS_BUFFERS: bool,
316    const REHASH: bool,
317>(
318    array: &GenericByteViewArray<T>,
319    random_state: &RandomState,
320    hashes_buffer: &mut [u64],
321) {
322    assert_eq!(
323        hashes_buffer.len(),
324        array.len(),
325        "hashes_buffer and array should be of equal length"
326    );
327
328    let buffers = array.data_buffers();
329    let view_bytes = |view_len: u32, view: u128| {
330        let view = ByteView::from(view);
331        let offset = view.offset as usize;
332        // SAFETY: view is a valid view as it came from the array
333        unsafe {
334            let data = buffers.get_unchecked(view.buffer_index as usize);
335            data.get_unchecked(offset..offset + view_len as usize)
336        }
337    };
338
339    let hashes_and_views = hashes_buffer.iter_mut().zip(array.views().iter());
340    for (i, (hash, &v)) in hashes_and_views.enumerate() {
341        if HAS_NULLS && array.is_null(i) {
342            continue;
343        }
344        let view_len = v as u32;
345        // all views are inlined, no need to access external buffers
346        if !HAS_BUFFERS || view_len <= 12 {
347            if REHASH {
348                let mut hasher = seeded_state(*hash).build_hasher();
349                v.hash_write(&mut hasher);
350                *hash = hasher.finish();
351            } else {
352                *hash = v.hash_one(random_state);
353            }
354            continue;
355        }
356        // view is not inlined, so we need to hash the bytes as well
357        let value = view_bytes(view_len, v);
358        if REHASH {
359            let mut hasher = seeded_state(*hash).build_hasher();
360            value.hash_write(&mut hasher);
361            *hash = hasher.finish();
362        } else {
363            *hash = value.hash_one(random_state);
364        }
365    }
366}
367
368/// Builds hash values for array views and writes them into `hashes_buffer`
369/// If `rehash==true` this combines the previous hash value in the buffer
370/// with the new hash using `combine_hashes`
371#[cfg(not(feature = "force_hash_collisions"))]
372fn hash_generic_byte_view_array<T: ByteViewType>(
373    array: &GenericByteViewArray<T>,
374    random_state: &RandomState,
375    hashes_buffer: &mut [u64],
376    rehash: bool,
377) {
378    // instantiate the correct version based on presence of nulls and external buffers
379    match (
380        array.null_count() != 0,
381        !array.data_buffers().is_empty(),
382        rehash,
383    ) {
384        // no nulls or buffers ==> hash the inlined views directly
385        // don't call the inner function as Rust seems better able to inline this simpler code (2-3% faster)
386        (false, false, false) => {
387            for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
388                *hash = view.hash_one(random_state);
389            }
390        }
391        (false, false, true) => {
392            for (hash, &view) in hashes_buffer.iter_mut().zip(array.views().iter()) {
393                let mut hasher = seeded_state(*hash).build_hasher();
394                view.hash_write(&mut hasher);
395                *hash = hasher.finish();
396            }
397        }
398        (false, true, false) => hash_string_view_array_inner::<T, false, true, false>(
399            array,
400            random_state,
401            hashes_buffer,
402        ),
403        (false, true, true) => hash_string_view_array_inner::<T, false, true, true>(
404            array,
405            random_state,
406            hashes_buffer,
407        ),
408        (true, false, false) => hash_string_view_array_inner::<T, true, false, false>(
409            array,
410            random_state,
411            hashes_buffer,
412        ),
413        (true, false, true) => hash_string_view_array_inner::<T, true, false, true>(
414            array,
415            random_state,
416            hashes_buffer,
417        ),
418        (true, true, false) => hash_string_view_array_inner::<T, true, true, false>(
419            array,
420            random_state,
421            hashes_buffer,
422        ),
423        (true, true, true) => hash_string_view_array_inner::<T, true, true, true>(
424            array,
425            random_state,
426            hashes_buffer,
427        ),
428    }
429}
430
431/// Hash dictionary array with compile-time specialization for null handling.
432///
433/// Uses const generics to eliminate runtim branching in the hot loop:
434/// - `HAS_NULL_KEYS`: Whether to check for null dictionary keys
435/// - `HAS_NULL_VALUES`: Whether to check for null dictionary values
436/// - `MULTI_COL`: Whether to combine with existing hash (true) or initialize (false)
437#[cfg(not(feature = "force_hash_collisions"))]
438#[inline(never)]
439fn hash_dictionary_inner<
440    K: ArrowDictionaryKeyType,
441    const HAS_NULL_KEYS: bool,
442    const HAS_NULL_VALUES: bool,
443    const MULTI_COL: bool,
444>(
445    array: &DictionaryArray<K>,
446    random_state: &RandomState,
447    hashes_buffer: &mut [u64],
448) -> Result<()> {
449    // Hash each dictionary value once, and then use that computed
450    // hash for each key value to avoid a potentially expensive
451    // redundant hashing for large dictionary elements (e.g. strings)
452    let dict_values = array.values();
453    let mut dict_hashes = vec![0; dict_values.len()];
454    create_hashes([dict_values], random_state, &mut dict_hashes)?;
455
456    if HAS_NULL_KEYS {
457        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
458            if let Some(key) = key {
459                let idx = key.as_usize();
460                if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
461                    if MULTI_COL {
462                        *hash = combine_hashes(dict_hashes[idx], *hash);
463                    } else {
464                        *hash = dict_hashes[idx];
465                    }
466                }
467            }
468        }
469    } else {
470        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().values()) {
471            let idx = key.as_usize();
472            if !HAS_NULL_VALUES || dict_values.is_valid(idx) {
473                if MULTI_COL {
474                    *hash = combine_hashes(dict_hashes[idx], *hash);
475                } else {
476                    *hash = dict_hashes[idx];
477                }
478            }
479        }
480    }
481    Ok(())
482}
483
484/// Hash the values in a dictionary array
485#[cfg(not(feature = "force_hash_collisions"))]
486fn hash_dictionary<K: ArrowDictionaryKeyType>(
487    array: &DictionaryArray<K>,
488    random_state: &RandomState,
489    hashes_buffer: &mut [u64],
490    multi_col: bool,
491) -> Result<()> {
492    let has_null_keys = array.keys().null_count() != 0;
493    let has_null_values = array.values().null_count() != 0;
494
495    // Dispatcher based on null presence and multi-column mode
496    // Should reduce branching within hot loops
497    match (has_null_keys, has_null_values, multi_col) {
498        (false, false, false) => hash_dictionary_inner::<K, false, false, false>(
499            array,
500            random_state,
501            hashes_buffer,
502        ),
503        (false, false, true) => hash_dictionary_inner::<K, false, false, true>(
504            array,
505            random_state,
506            hashes_buffer,
507        ),
508        (false, true, false) => hash_dictionary_inner::<K, false, true, false>(
509            array,
510            random_state,
511            hashes_buffer,
512        ),
513        (false, true, true) => hash_dictionary_inner::<K, false, true, true>(
514            array,
515            random_state,
516            hashes_buffer,
517        ),
518        (true, false, false) => hash_dictionary_inner::<K, true, false, false>(
519            array,
520            random_state,
521            hashes_buffer,
522        ),
523        (true, false, true) => hash_dictionary_inner::<K, true, false, true>(
524            array,
525            random_state,
526            hashes_buffer,
527        ),
528        (true, true, false) => hash_dictionary_inner::<K, true, true, false>(
529            array,
530            random_state,
531            hashes_buffer,
532        ),
533        (true, true, true) => hash_dictionary_inner::<K, true, true, true>(
534            array,
535            random_state,
536            hashes_buffer,
537        ),
538    }
539}
540
541#[cfg(not(feature = "force_hash_collisions"))]
542fn hash_struct_array(
543    array: &StructArray,
544    random_state: &RandomState,
545    hashes_buffer: &mut [u64],
546) -> Result<()> {
547    let nulls = array.nulls();
548    let row_len = array.len();
549
550    // Create hashes for each row that combines the hashes over all the column at that row.
551    let mut values_hashes = vec![0u64; row_len];
552    create_hashes(array.columns(), random_state, &mut values_hashes)?;
553
554    // Separate paths to avoid allocating Vec when there are no nulls
555    if let Some(nulls) = nulls {
556        for i in nulls.valid_indices() {
557            let hash = &mut hashes_buffer[i];
558            *hash = combine_hashes(*hash, values_hashes[i]);
559        }
560    } else {
561        for i in 0..row_len {
562            let hash = &mut hashes_buffer[i];
563            *hash = combine_hashes(*hash, values_hashes[i]);
564        }
565    }
566
567    Ok(())
568}
569
570// only adding this `cfg` b/c this function is only used with this `cfg`
571#[cfg(not(feature = "force_hash_collisions"))]
572fn hash_map_array(
573    array: &MapArray,
574    random_state: &RandomState,
575    hashes_buffer: &mut [u64],
576) -> Result<()> {
577    let nulls = array.nulls();
578    let offsets = array.offsets();
579
580    // Create hashes for each entry in each row
581    let first_offset = offsets.first().copied().unwrap_or_default() as usize;
582    let last_offset = offsets.last().copied().unwrap_or_default() as usize;
583    let entries_len = last_offset - first_offset;
584
585    // Only hash the entries that are actually referenced
586    let mut values_hashes = vec![0u64; entries_len];
587    let entries = array.entries();
588    let sliced_columns: Vec<ArrayRef> = entries
589        .columns()
590        .iter()
591        .map(|col| col.slice(first_offset, entries_len))
592        .collect();
593    create_hashes(&sliced_columns, random_state, &mut values_hashes)?;
594
595    // Combine the hashes for entries on each row with each other and previous hash for that row
596    // Adjust indices by first_offset since values_hashes is sliced starting from first_offset
597    if let Some(nulls) = nulls {
598        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
599            if nulls.is_valid(i) {
600                let hash = &mut hashes_buffer[i];
601                for values_hash in &values_hashes
602                    [start.as_usize() - first_offset..stop.as_usize() - first_offset]
603                {
604                    *hash = combine_hashes(*hash, *values_hash);
605                }
606            }
607        }
608    } else {
609        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
610            let hash = &mut hashes_buffer[i];
611            for values_hash in &values_hashes
612                [start.as_usize() - first_offset..stop.as_usize() - first_offset]
613            {
614                *hash = combine_hashes(*hash, *values_hash);
615            }
616        }
617    }
618
619    Ok(())
620}
621
622#[cfg(not(feature = "force_hash_collisions"))]
623fn hash_list_array<OffsetSize>(
624    array: &GenericListArray<OffsetSize>,
625    random_state: &RandomState,
626    hashes_buffer: &mut [u64],
627) -> Result<()>
628where
629    OffsetSize: OffsetSizeTrait,
630{
631    // In case values is sliced, hash only the bytes used by the offsets of this ListArray
632    let first_offset = array.value_offsets().first().cloned().unwrap_or_default();
633    let last_offset = array.value_offsets().last().cloned().unwrap_or_default();
634    let value_bytes_len = (last_offset - first_offset).as_usize();
635    let mut values_hashes = vec![0u64; value_bytes_len];
636    create_hashes(
637        [array
638            .values()
639            .slice(first_offset.as_usize(), value_bytes_len)],
640        random_state,
641        &mut values_hashes,
642    )?;
643
644    if array.null_count() > 0 {
645        for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate()
646        {
647            if array.is_valid(i) {
648                let hash = &mut hashes_buffer[i];
649                for values_hash in &values_hashes[(*start - first_offset).as_usize()
650                    ..(*stop - first_offset).as_usize()]
651                {
652                    *hash = combine_hashes(*hash, *values_hash);
653                }
654            }
655        }
656    } else {
657        for ((start, stop), hash) in array
658            .value_offsets()
659            .iter()
660            .tuple_windows()
661            .zip(hashes_buffer.iter_mut())
662        {
663            for values_hash in &values_hashes
664                [(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()]
665            {
666                *hash = combine_hashes(*hash, *values_hash);
667            }
668        }
669    }
670    Ok(())
671}
672
673#[cfg(not(feature = "force_hash_collisions"))]
674fn hash_list_view_array<OffsetSize>(
675    array: &GenericListViewArray<OffsetSize>,
676    random_state: &RandomState,
677    hashes_buffer: &mut [u64],
678) -> Result<()>
679where
680    OffsetSize: OffsetSizeTrait,
681{
682    let values = array.values();
683    let offsets = array.value_offsets();
684    let sizes = array.value_sizes();
685    let nulls = array.nulls();
686    let mut values_hashes = vec![0u64; values.len()];
687    create_hashes([values], random_state, &mut values_hashes)?;
688    if let Some(nulls) = nulls {
689        for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
690            if nulls.is_valid(i) {
691                let hash = &mut hashes_buffer[i];
692                let start = offset.as_usize();
693                let end = start + size.as_usize();
694                for values_hash in &values_hashes[start..end] {
695                    *hash = combine_hashes(*hash, *values_hash);
696                }
697            }
698        }
699    } else {
700        for (i, (offset, size)) in offsets.iter().zip(sizes.iter()).enumerate() {
701            let hash = &mut hashes_buffer[i];
702            let start = offset.as_usize();
703            let end = start + size.as_usize();
704            for values_hash in &values_hashes[start..end] {
705                *hash = combine_hashes(*hash, *values_hash);
706            }
707        }
708    }
709    Ok(())
710}
711
712#[cfg(not(feature = "force_hash_collisions"))]
713fn hash_union_array(
714    array: &UnionArray,
715    random_state: &RandomState,
716    hashes_buffer: &mut [u64],
717) -> Result<()> {
718    let DataType::Union(union_fields, _mode) = array.data_type() else {
719        unreachable!()
720    };
721
722    if array.is_dense() {
723        // Dense union: children only contain values of their type, so they're already compact.
724        // Use the default hashing approach which is efficient for dense unions.
725        hash_union_array_default(array, union_fields, random_state, hashes_buffer)
726    } else {
727        // Sparse union: each child has the same length as the union array.
728        // Optimization: only hash the elements that are actually referenced by type_ids,
729        // instead of hashing all K*N elements (where K = num types, N = array length).
730        hash_sparse_union_array(array, union_fields, random_state, hashes_buffer)
731    }
732}
733
734/// Default hashing for union arrays - hashes all elements of each child array fully.
735///
736/// This approach works for both dense and sparse union arrays:
737/// - Dense unions: children are compact (each child only contains values of that type)
738/// - Sparse unions: children have the same length as the union array
739///
740/// For sparse unions with 3+ types, the optimized take/scatter approach in
741/// `hash_sparse_union_array` is more efficient, but for 1-2 types or dense unions,
742/// this simpler approach is preferred.
743#[cfg(not(feature = "force_hash_collisions"))]
744fn hash_union_array_default(
745    array: &UnionArray,
746    union_fields: &UnionFields,
747    random_state: &RandomState,
748    hashes_buffer: &mut [u64],
749) -> Result<()> {
750    let mut child_hashes: HashMap<i8, Vec<u64>> =
751        HashMap::with_capacity(union_fields.len());
752
753    // Hash each child array fully
754    for (type_id, _field) in union_fields.iter() {
755        let child = array.child(type_id);
756        let mut child_hash_buffer = vec![0; child.len()];
757        create_hashes([child], random_state, &mut child_hash_buffer)?;
758
759        child_hashes.insert(type_id, child_hash_buffer);
760    }
761
762    // Combine hashes for each row using the appropriate child offset
763    // For dense unions: value_offset points to the actual position in the child
764    // For sparse unions: value_offset equals the row index
765    #[expect(clippy::needless_range_loop)]
766    for i in 0..array.len() {
767        let type_id = array.type_id(i);
768        let child_offset = array.value_offset(i);
769
770        let child_hash = child_hashes.get(&type_id).expect("invalid type_id");
771        hashes_buffer[i] = combine_hashes(hashes_buffer[i], child_hash[child_offset]);
772    }
773
774    Ok(())
775}
776
777/// Hash a sparse union array.
778/// Sparse unions have child arrays with the same length as the union array.
779/// For 3+ types, we optimize by only hashing the N elements that are actually used
780/// (via take/scatter), instead of hashing all K*N elements.
781///
782/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we use
783/// the default approach of hashing all children (same as dense unions).
784#[cfg(not(feature = "force_hash_collisions"))]
785fn hash_sparse_union_array(
786    array: &UnionArray,
787    union_fields: &UnionFields,
788    random_state: &RandomState,
789    hashes_buffer: &mut [u64],
790) -> Result<()> {
791    use std::collections::HashMap;
792
793    // For 1-2 types, the take/scatter overhead isn't worth it.
794    // Fall back to the default approach (same as dense union).
795    if union_fields.len() <= 2 {
796        return hash_union_array_default(
797            array,
798            union_fields,
799            random_state,
800            hashes_buffer,
801        );
802    }
803
804    let type_ids = array.type_ids();
805
806    // Group indices by type_id
807    let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
808    for (i, &type_id) in type_ids.iter().enumerate() {
809        indices_by_type.entry(type_id).or_default().push(i as u32);
810    }
811
812    // For each type, extract only the needed elements, hash them, and scatter back
813    for (type_id, _field) in union_fields.iter() {
814        if let Some(indices) = indices_by_type.get(&type_id) {
815            if indices.is_empty() {
816                continue;
817            }
818
819            let child = array.child(type_id);
820            let indices_array = UInt32Array::from(indices.clone());
821
822            // Extract only the elements we need using take()
823            let filtered = take(child.as_ref(), &indices_array, None)?;
824
825            // Hash the filtered array
826            let mut filtered_hashes = vec![0u64; filtered.len()];
827            create_hashes([&filtered], random_state, &mut filtered_hashes)?;
828
829            // Scatter hashes back to correct positions
830            for (hash, &idx) in filtered_hashes.iter().zip(indices.iter()) {
831                hashes_buffer[idx as usize] =
832                    combine_hashes(hashes_buffer[idx as usize], *hash);
833            }
834        }
835    }
836
837    Ok(())
838}
839
840#[cfg(not(feature = "force_hash_collisions"))]
841fn hash_fixed_list_array(
842    array: &FixedSizeListArray,
843    random_state: &RandomState,
844    hashes_buffer: &mut [u64],
845) -> Result<()> {
846    let values = array.values();
847    let value_length = array.value_length() as usize;
848    let nulls = array.nulls();
849    let mut values_hashes = vec![0u64; values.len()];
850    create_hashes([values], random_state, &mut values_hashes)?;
851    if let Some(nulls) = nulls {
852        for i in 0..array.len() {
853            if nulls.is_valid(i) {
854                let hash = &mut hashes_buffer[i];
855                for values_hash in
856                    &values_hashes[i * value_length..(i + 1) * value_length]
857                {
858                    *hash = combine_hashes(*hash, *values_hash);
859                }
860            }
861        }
862    } else {
863        for i in 0..array.len() {
864            let hash = &mut hashes_buffer[i];
865            for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
866                *hash = combine_hashes(*hash, *values_hash);
867            }
868        }
869    }
870    Ok(())
871}
872
873/// Inner hash function for RunArray
874#[inline(never)]
875#[cfg(not(feature = "force_hash_collisions"))]
876fn hash_run_array_inner<
877    R: RunEndIndexType,
878    const HAS_NULL_VALUES: bool,
879    const REHASH: bool,
880>(
881    array: &RunArray<R>,
882    random_state: &RandomState,
883    hashes_buffer: &mut [u64],
884) -> Result<()> {
885    // We find the relevant runs that cover potentially sliced arrays, so we can only hash those
886    // values. Then we find the runs that refer to the original runs and ensure that we apply
887    // hashes correctly to the sliced, whether sliced at the start, end, or both.
888    let array_offset = array.offset();
889    let array_len = array.len();
890
891    if array_len == 0 {
892        return Ok(());
893    }
894
895    let run_ends = array.run_ends();
896    let run_ends_values = run_ends.values();
897    let values = array.values();
898
899    let start_physical_index = array.get_start_physical_index();
900    // get_end_physical_index returns the inclusive last index, but we need the exclusive range end
901    // for the operations we use below.
902    let end_physical_index = array.get_end_physical_index() + 1;
903
904    let sliced_values = values.slice(
905        start_physical_index,
906        end_physical_index - start_physical_index,
907    );
908    let mut values_hashes = vec![0u64; sliced_values.len()];
909    create_hashes(
910        std::slice::from_ref(&sliced_values),
911        random_state,
912        &mut values_hashes,
913    )?;
914
915    let mut start_in_slice = 0;
916    for (adjusted_physical_index, &absolute_run_end) in run_ends_values
917        [start_physical_index..end_physical_index]
918        .iter()
919        .enumerate()
920    {
921        let absolute_run_end = absolute_run_end.as_usize();
922        let end_in_slice = (absolute_run_end - array_offset).min(array_len);
923
924        if HAS_NULL_VALUES && sliced_values.is_null(adjusted_physical_index) {
925            start_in_slice = end_in_slice;
926            continue;
927        }
928
929        let value_hash = values_hashes[adjusted_physical_index];
930        let run_slice = &mut hashes_buffer[start_in_slice..end_in_slice];
931
932        if REHASH {
933            for hash in run_slice.iter_mut() {
934                *hash = combine_hashes(value_hash, *hash);
935            }
936        } else {
937            run_slice.fill(value_hash);
938        }
939
940        start_in_slice = end_in_slice;
941    }
942
943    Ok(())
944}
945
946#[cfg(not(feature = "force_hash_collisions"))]
947fn hash_run_array<R: RunEndIndexType>(
948    array: &RunArray<R>,
949    random_state: &RandomState,
950    hashes_buffer: &mut [u64],
951    rehash: bool,
952) -> Result<()> {
953    let has_null_values = array.values().null_count() != 0;
954
955    match (has_null_values, rehash) {
956        (false, false) => {
957            hash_run_array_inner::<R, false, false>(array, random_state, hashes_buffer)
958        }
959        (false, true) => {
960            hash_run_array_inner::<R, false, true>(array, random_state, hashes_buffer)
961        }
962        (true, false) => {
963            hash_run_array_inner::<R, true, false>(array, random_state, hashes_buffer)
964        }
965        (true, true) => {
966            hash_run_array_inner::<R, true, true>(array, random_state, hashes_buffer)
967        }
968    }
969}
970
971/// Internal helper function that hashes a single array and either initializes or combines
972/// the hash values in the buffer.
973#[cfg(not(feature = "force_hash_collisions"))]
974fn hash_single_array(
975    array: &dyn Array,
976    random_state: &RandomState,
977    hashes_buffer: &mut [u64],
978    rehash: bool,
979) -> Result<()> {
980    downcast_primitive_array! {
981        array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
982        DataType::Null => hash_null(random_state, hashes_buffer, rehash),
983        DataType::Boolean => hash_array(&as_boolean_array(array)?, random_state, hashes_buffer, rehash),
984        DataType::Utf8 => hash_array(&as_string_array(array)?, random_state, hashes_buffer, rehash),
985        DataType::Utf8View => hash_generic_byte_view_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
986        DataType::LargeUtf8 => hash_array(&as_largestring_array(array), random_state, hashes_buffer, rehash),
987        DataType::Binary => hash_array(&as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
988        DataType::BinaryView => hash_generic_byte_view_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
989        DataType::LargeBinary => hash_array(&as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
990        DataType::FixedSizeBinary(_) => {
991            let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
992            hash_array(&array, random_state, hashes_buffer, rehash)
993        }
994        DataType::Dictionary(_, _) => downcast_dictionary_array! {
995            array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
996            _ => unreachable!()
997        }
998        DataType::Struct(_) => {
999            let array = as_struct_array(array)?;
1000            hash_struct_array(array, random_state, hashes_buffer)?;
1001        }
1002        DataType::List(_) => {
1003            let array = as_list_array(array)?;
1004            hash_list_array(array, random_state, hashes_buffer)?;
1005        }
1006        DataType::LargeList(_) => {
1007            let array = as_large_list_array(array)?;
1008            hash_list_array(array, random_state, hashes_buffer)?;
1009        }
1010        DataType::ListView(_) => {
1011            let array = as_list_view_array(array)?;
1012            hash_list_view_array(array, random_state, hashes_buffer)?;
1013        }
1014        DataType::LargeListView(_) => {
1015            let array = as_large_list_view_array(array)?;
1016            hash_list_view_array(array, random_state, hashes_buffer)?;
1017        }
1018        DataType::Map(_, _) => {
1019            let array = as_map_array(array)?;
1020            hash_map_array(array, random_state, hashes_buffer)?;
1021        }
1022        DataType::FixedSizeList(_,_) => {
1023            let array = as_fixed_size_list_array(array)?;
1024            hash_fixed_list_array(array, random_state, hashes_buffer)?;
1025        }
1026        DataType::Union(_, _) => {
1027            let array = as_union_array(array)?;
1028            hash_union_array(array, random_state, hashes_buffer)?;
1029        }
1030        DataType::RunEndEncoded(_, _) => downcast_run_array! {
1031            array => hash_run_array(array, random_state, hashes_buffer, rehash)?,
1032            _ => unreachable!()
1033        }
1034        _ => {
1035            // This is internal because we should have caught this before.
1036            return _internal_err!(
1037                "Unsupported data type in hasher: {}",
1038                array.data_type()
1039            );
1040        }
1041    }
1042    Ok(())
1043}
1044
1045/// Test version of `hash_single_array` that forces all hashes to collide to zero.
1046#[cfg(feature = "force_hash_collisions")]
1047fn hash_single_array(
1048    _array: &dyn Array,
1049    _random_state: &RandomState,
1050    hashes_buffer: &mut [u64],
1051    _rehash: bool,
1052) -> Result<()> {
1053    for hash in hashes_buffer.iter_mut() {
1054        *hash = 0
1055    }
1056    Ok(())
1057}
1058
1059/// Something that can be returned as a `&dyn Array`.
1060///
1061/// We want `create_hashes` to accept either `&dyn Array` or `ArrayRef`,
1062/// and this seems the best way to do so.
1063///
1064/// We tried having it accept `AsRef<dyn Array>`
1065/// but that is not implemented for and cannot be implemented for
1066/// `&dyn Array` so callers that have the latter would not be able
1067/// to call `create_hashes` directly. This shim trait makes it possible.
1068pub trait AsDynArray {
1069    fn as_dyn_array(&self) -> &dyn Array;
1070}
1071
1072impl AsDynArray for dyn Array {
1073    fn as_dyn_array(&self) -> &dyn Array {
1074        self
1075    }
1076}
1077
1078impl AsDynArray for &dyn Array {
1079    fn as_dyn_array(&self) -> &dyn Array {
1080        *self
1081    }
1082}
1083
1084impl AsDynArray for ArrayRef {
1085    fn as_dyn_array(&self) -> &dyn Array {
1086        self.as_ref()
1087    }
1088}
1089
1090impl AsDynArray for &ArrayRef {
1091    fn as_dyn_array(&self) -> &dyn Array {
1092        self.as_ref()
1093    }
1094}
1095
1096/// Creates hash values for every row, based on the values in the columns.
1097///
1098/// The number of rows to hash is determined by `hashes_buffer.len()`.
1099/// `hashes_buffer` should be pre-sized appropriately.
1100pub fn create_hashes<'a, I, T>(
1101    arrays: I,
1102    random_state: &RandomState,
1103    hashes_buffer: &'a mut [u64],
1104) -> Result<&'a mut [u64]>
1105where
1106    I: IntoIterator<Item = T>,
1107    T: AsDynArray,
1108{
1109    for (i, array) in arrays.into_iter().enumerate() {
1110        // combine hashes with `combine_hashes` for all columns besides the first
1111        let rehash = i >= 1;
1112        hash_single_array(array.as_dyn_array(), random_state, hashes_buffer, rehash)?;
1113    }
1114    Ok(hashes_buffer)
1115}
1116
1117#[cfg(test)]
1118mod tests {
1119    use std::sync::Arc;
1120
1121    use arrow::array::*;
1122    #[cfg(not(feature = "force_hash_collisions"))]
1123    use arrow::datatypes::*;
1124
1125    use super::*;
1126
1127    #[test]
1128    fn create_hashes_for_decimal_array() -> Result<()> {
1129        let array = vec![1, 2, 3, 4]
1130            .into_iter()
1131            .map(Some)
1132            .collect::<Decimal128Array>()
1133            .with_precision_and_scale(20, 3)
1134            .unwrap();
1135        let array_ref: ArrayRef = Arc::new(array);
1136        let random_state = RandomState::with_seed(0);
1137        let hashes_buff = &mut vec![0; array_ref.len()];
1138        let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
1139        assert_eq!(hashes.len(), 4);
1140        Ok(())
1141    }
1142
1143    #[test]
1144    fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
1145        let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
1146        let random_state = RandomState::with_seed(0);
1147        let hashes_buff = &mut [0; 0];
1148        let hashes = create_hashes(
1149            &[Arc::new(empty_array) as ArrayRef],
1150            &random_state,
1151            hashes_buff,
1152        )?;
1153        assert_eq!(hashes, &Vec::<u64>::new());
1154        Ok(())
1155    }
1156
1157    #[test]
1158    fn create_hashes_for_float_arrays() -> Result<()> {
1159        let f32_arr: ArrayRef =
1160            Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
1161        let f64_arr: ArrayRef =
1162            Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
1163
1164        let random_state = RandomState::with_seed(0);
1165        let hashes_buff = &mut vec![0; f32_arr.len()];
1166        let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
1167        assert_eq!(hashes.len(), 4,);
1168
1169        let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
1170        assert_eq!(hashes.len(), 4,);
1171
1172        Ok(())
1173    }
1174
1175    macro_rules! create_hash_binary {
1176        ($NAME:ident, $ARRAY:ty) => {
1177            #[cfg(not(feature = "force_hash_collisions"))]
1178            #[test]
1179            fn $NAME() {
1180                let binary = [
1181                    Some(b"short".to_byte_slice()),
1182                    None,
1183                    Some(b"long but different 12 bytes string"),
1184                    Some(b"short2"),
1185                    Some(b"Longer than 12 bytes string"),
1186                    Some(b"short"),
1187                    Some(b"Longer than 12 bytes string"),
1188                ];
1189
1190                let binary_array: ArrayRef =
1191                    Arc::new(binary.iter().cloned().collect::<$ARRAY>());
1192
1193                let random_state = RandomState::with_seed(0);
1194
1195                let mut binary_hashes = vec![0; binary.len()];
1196                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
1197                    .unwrap();
1198
1199                // Null values result in a zero hash,
1200                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
1201                    match val {
1202                        Some(_) => assert_ne!(*hash, 0),
1203                        None => assert_eq!(*hash, 0),
1204                    }
1205                }
1206
1207                // Same values should map to same hash values
1208                assert_eq!(binary[0], binary[5]);
1209                assert_eq!(binary[4], binary[6]);
1210
1211                // different binary should map to different hash values
1212                assert_ne!(binary[0], binary[2]);
1213            }
1214        };
1215    }
1216
1217    create_hash_binary!(binary_array, BinaryArray);
1218    create_hash_binary!(large_binary_array, LargeBinaryArray);
1219    create_hash_binary!(binary_view_array, BinaryViewArray);
1220
1221    #[test]
1222    fn create_hashes_fixed_size_binary() -> Result<()> {
1223        let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
1224        let fixed_size_binary_array: ArrayRef =
1225            Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
1226
1227        let random_state = RandomState::with_seed(0);
1228        let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
1229        let hashes =
1230            create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
1231        assert_eq!(hashes.len(), 3,);
1232
1233        Ok(())
1234    }
1235
1236    macro_rules! create_hash_string {
1237        ($NAME:ident, $ARRAY:ty) => {
1238            #[cfg(not(feature = "force_hash_collisions"))]
1239            #[test]
1240            fn $NAME() {
1241                let strings = [
1242                    Some("short"),
1243                    None,
1244                    Some("long but different 12 bytes string"),
1245                    Some("short2"),
1246                    Some("Longer than 12 bytes string"),
1247                    Some("short"),
1248                    Some("Longer than 12 bytes string"),
1249                ];
1250
1251                let string_array: ArrayRef =
1252                    Arc::new(strings.iter().cloned().collect::<$ARRAY>());
1253                let dict_array: ArrayRef = Arc::new(
1254                    strings
1255                        .iter()
1256                        .cloned()
1257                        .collect::<DictionaryArray<Int8Type>>(),
1258                );
1259
1260                let random_state = RandomState::with_seed(0);
1261
1262                let mut string_hashes = vec![0; strings.len()];
1263                create_hashes(&[string_array], &random_state, &mut string_hashes)
1264                    .unwrap();
1265
1266                let mut dict_hashes = vec![0; strings.len()];
1267                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
1268
1269                // Null values result in a zero hash,
1270                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
1271                    match val {
1272                        Some(_) => assert_ne!(*hash, 0),
1273                        None => assert_eq!(*hash, 0),
1274                    }
1275                }
1276
1277                // same logical values should hash to the same hash value
1278                assert_eq!(string_hashes, dict_hashes);
1279
1280                // Same values should map to same hash values
1281                assert_eq!(strings[0], strings[5]);
1282                assert_eq!(strings[4], strings[6]);
1283
1284                // different strings should map to different hash values
1285                assert_ne!(strings[0], strings[2]);
1286            }
1287        };
1288    }
1289
1290    create_hash_string!(string_array, StringArray);
1291    create_hash_string!(large_string_array, LargeStringArray);
1292    create_hash_string!(string_view_array, StringArray);
1293    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
1294
1295    #[test]
1296    #[cfg(not(feature = "force_hash_collisions"))]
1297    fn create_hashes_for_run_array() -> Result<()> {
1298        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
1299        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
1300        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
1301
1302        let random_state = RandomState::with_seed(0);
1303        let hashes_buff = &mut vec![0; array.len()];
1304        let hashes = create_hashes(
1305            &[Arc::clone(&array) as ArrayRef],
1306            &random_state,
1307            hashes_buff,
1308        )?;
1309
1310        assert_eq!(hashes.len(), 7);
1311        assert_eq!(hashes[0], hashes[1]);
1312        assert_eq!(hashes[2], hashes[3]);
1313        assert_eq!(hashes[3], hashes[4]);
1314        assert_eq!(hashes[5], hashes[6]);
1315        assert_ne!(hashes[0], hashes[2]);
1316        assert_ne!(hashes[2], hashes[5]);
1317        assert_ne!(hashes[0], hashes[5]);
1318
1319        Ok(())
1320    }
1321
1322    #[test]
1323    #[cfg(not(feature = "force_hash_collisions"))]
1324    fn create_multi_column_hash_with_run_array() -> Result<()> {
1325        let int_array = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7]));
1326        let values = Arc::new(StringArray::from(vec!["foo", "bar", "baz"]));
1327        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
1328        let run_array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
1329
1330        let random_state = RandomState::with_seed(0);
1331        let mut one_col_hashes = vec![0; int_array.len()];
1332        create_hashes(
1333            &[Arc::clone(&int_array) as ArrayRef],
1334            &random_state,
1335            &mut one_col_hashes,
1336        )?;
1337
1338        let mut two_col_hashes = vec![0; int_array.len()];
1339        create_hashes(
1340            &[
1341                Arc::clone(&int_array) as ArrayRef,
1342                Arc::clone(&run_array) as ArrayRef,
1343            ],
1344            &random_state,
1345            &mut two_col_hashes,
1346        )?;
1347
1348        assert_eq!(one_col_hashes.len(), 7);
1349        assert_eq!(two_col_hashes.len(), 7);
1350        assert_ne!(one_col_hashes, two_col_hashes);
1351
1352        let diff_0_vs_1_one_col = one_col_hashes[0] != one_col_hashes[1];
1353        let diff_0_vs_1_two_col = two_col_hashes[0] != two_col_hashes[1];
1354        assert_eq!(diff_0_vs_1_one_col, diff_0_vs_1_two_col);
1355
1356        let diff_2_vs_3_one_col = one_col_hashes[2] != one_col_hashes[3];
1357        let diff_2_vs_3_two_col = two_col_hashes[2] != two_col_hashes[3];
1358        assert_eq!(diff_2_vs_3_one_col, diff_2_vs_3_two_col);
1359
1360        Ok(())
1361    }
1362
1363    #[test]
1364    // Tests actual values of hashes, which are different if forcing collisions
1365    #[cfg(not(feature = "force_hash_collisions"))]
1366    fn create_hashes_for_dict_arrays() {
1367        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
1368
1369        let string_array: ArrayRef =
1370            Arc::new(strings.iter().cloned().collect::<StringArray>());
1371        let dict_array: ArrayRef = Arc::new(
1372            strings
1373                .iter()
1374                .cloned()
1375                .collect::<DictionaryArray<Int8Type>>(),
1376        );
1377
1378        let random_state = RandomState::with_seed(0);
1379
1380        let mut string_hashes = vec![0; strings.len()];
1381        create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
1382
1383        let mut dict_hashes = vec![0; strings.len()];
1384        create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
1385
1386        // Null values result in a zero hash,
1387        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
1388            match val {
1389                Some(_) => assert_ne!(*hash, 0),
1390                None => assert_eq!(*hash, 0),
1391            }
1392        }
1393
1394        // same logical values should hash to the same hash value
1395        assert_eq!(string_hashes, dict_hashes);
1396
1397        // Same values should map to same hash values
1398        assert_eq!(strings[1], strings[4]);
1399        assert_eq!(dict_hashes[1], dict_hashes[4]);
1400        assert_eq!(strings[0], strings[3]);
1401        assert_eq!(dict_hashes[0], dict_hashes[3]);
1402
1403        // different strings should map to different hash values
1404        assert_ne!(strings[0], strings[2]);
1405        assert_ne!(dict_hashes[0], dict_hashes[2]);
1406    }
1407
1408    #[test]
1409    // Tests actual values of hashes, which are different if forcing collisions
1410    #[cfg(not(feature = "force_hash_collisions"))]
1411    fn create_hashes_for_list_arrays() {
1412        let data = vec![
1413            Some(vec![Some(0), Some(1), Some(2)]),
1414            None,
1415            Some(vec![Some(3), None, Some(5)]),
1416            Some(vec![Some(3), None, Some(5)]),
1417            None,
1418            Some(vec![Some(0), Some(1), Some(2)]),
1419            Some(vec![]),
1420        ];
1421        let list_array =
1422            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
1423        let random_state = RandomState::with_seed(0);
1424        let mut hashes = vec![0; list_array.len()];
1425        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1426        assert_eq!(hashes[0], hashes[5]);
1427        assert_eq!(hashes[1], hashes[4]);
1428        assert_eq!(hashes[2], hashes[3]);
1429        assert_eq!(hashes[1], hashes[6]); // null vs empty list
1430    }
1431
1432    #[test]
1433    #[cfg(not(feature = "force_hash_collisions"))]
1434    fn create_hashes_for_sliced_list_arrays() {
1435        let data = vec![
1436            Some(vec![Some(0), Some(1), Some(2)]),
1437            None,
1438            // Slice from here
1439            Some(vec![Some(3), None, Some(5)]),
1440            Some(vec![Some(3), None, Some(5)]),
1441            None,
1442            // To here
1443            Some(vec![Some(0), Some(1), Some(2)]),
1444            Some(vec![]),
1445        ];
1446        let list_array =
1447            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
1448        let list_array = list_array.slice(2, 3);
1449        let random_state = RandomState::with_seed(0);
1450        let mut hashes = vec![0; list_array.len()];
1451        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1452        assert_eq!(hashes[0], hashes[1]);
1453        assert_ne!(hashes[1], hashes[2]);
1454    }
1455
1456    #[test]
1457    // Tests actual values of hashes, which are different if forcing collisions
1458    #[cfg(not(feature = "force_hash_collisions"))]
1459    fn create_hashes_for_list_view_arrays() {
1460        use arrow::buffer::{NullBuffer, ScalarBuffer};
1461
1462        // Create values array: [0, 1, 2, 3, null, 5]
1463        let values = Arc::new(Int32Array::from(vec![
1464            Some(0),
1465            Some(1),
1466            Some(2),
1467            Some(3),
1468            None,
1469            Some(5),
1470        ])) as ArrayRef;
1471        let field = Arc::new(Field::new("item", DataType::Int32, true));
1472
1473        // Create ListView with the following logical structure:
1474        // Row 0: [0, 1, 2]        (offset=0, size=3)
1475        // Row 1: null             (null bit set)
1476        // Row 2: [3, null, 5]     (offset=3, size=3)
1477        // Row 3: [3, null, 5]     (offset=3, size=3) - same as row 2
1478        // Row 4: null             (null bit set)
1479        // Row 5: [0, 1, 2]        (offset=0, size=3) - same as row 0
1480        // Row 6: []               (offset=0, size=0) - empty list
1481        let offsets = ScalarBuffer::from(vec![0i32, 0, 3, 3, 0, 0, 0]);
1482        let sizes = ScalarBuffer::from(vec![3i32, 0, 3, 3, 0, 3, 0]);
1483        let nulls = Some(NullBuffer::from(vec![
1484            true, false, true, true, false, true, true,
1485        ]));
1486
1487        let list_view_array =
1488            Arc::new(ListViewArray::new(field, offsets, sizes, values, nulls))
1489                as ArrayRef;
1490
1491        let random_state = RandomState::with_seed(0);
1492        let mut hashes = vec![0; list_view_array.len()];
1493        create_hashes(&[list_view_array], &random_state, &mut hashes).unwrap();
1494
1495        assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2]
1496        assert_eq!(hashes[1], hashes[4]); // both null
1497        assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5]
1498        assert_eq!(hashes[1], hashes[6]); // null vs empty list
1499
1500        // Negative tests: different content should produce different hashes
1501        assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5]
1502        assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs []
1503        assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs []
1504    }
1505
1506    #[test]
1507    // Tests actual values of hashes, which are different if forcing collisions
1508    #[cfg(not(feature = "force_hash_collisions"))]
1509    fn create_hashes_for_large_list_view_arrays() {
1510        use arrow::buffer::{NullBuffer, ScalarBuffer};
1511
1512        // Create values array: [0, 1, 2, 3, null, 5]
1513        let values = Arc::new(Int32Array::from(vec![
1514            Some(0),
1515            Some(1),
1516            Some(2),
1517            Some(3),
1518            None,
1519            Some(5),
1520        ])) as ArrayRef;
1521        let field = Arc::new(Field::new("item", DataType::Int32, true));
1522
1523        // Create LargeListView with the following logical structure:
1524        // Row 0: [0, 1, 2]        (offset=0, size=3)
1525        // Row 1: null             (null bit set)
1526        // Row 2: [3, null, 5]     (offset=3, size=3)
1527        // Row 3: [3, null, 5]     (offset=3, size=3) - same as row 2
1528        // Row 4: null             (null bit set)
1529        // Row 5: [0, 1, 2]        (offset=0, size=3) - same as row 0
1530        // Row 6: []               (offset=0, size=0) - empty list
1531        let offsets = ScalarBuffer::from(vec![0i64, 0, 3, 3, 0, 0, 0]);
1532        let sizes = ScalarBuffer::from(vec![3i64, 0, 3, 3, 0, 3, 0]);
1533        let nulls = Some(NullBuffer::from(vec![
1534            true, false, true, true, false, true, true,
1535        ]));
1536
1537        let large_list_view_array = Arc::new(LargeListViewArray::new(
1538            field, offsets, sizes, values, nulls,
1539        )) as ArrayRef;
1540
1541        let random_state = RandomState::with_seed(0);
1542        let mut hashes = vec![0; large_list_view_array.len()];
1543        create_hashes(&[large_list_view_array], &random_state, &mut hashes).unwrap();
1544
1545        assert_eq!(hashes[0], hashes[5]); // same content [0, 1, 2]
1546        assert_eq!(hashes[1], hashes[4]); // both null
1547        assert_eq!(hashes[2], hashes[3]); // same content [3, null, 5]
1548        assert_eq!(hashes[1], hashes[6]); // null vs empty list
1549
1550        // Negative tests: different content should produce different hashes
1551        assert_ne!(hashes[0], hashes[2]); // [0, 1, 2] vs [3, null, 5]
1552        assert_ne!(hashes[0], hashes[6]); // [0, 1, 2] vs []
1553        assert_ne!(hashes[2], hashes[6]); // [3, null, 5] vs []
1554    }
1555
1556    #[test]
1557    // Tests actual values of hashes, which are different if forcing collisions
1558    #[cfg(not(feature = "force_hash_collisions"))]
1559    fn create_hashes_for_fixed_size_list_arrays() {
1560        let data = vec![
1561            Some(vec![Some(0), Some(1), Some(2)]),
1562            None,
1563            Some(vec![Some(3), None, Some(5)]),
1564            Some(vec![Some(3), None, Some(5)]),
1565            None,
1566            Some(vec![Some(0), Some(1), Some(2)]),
1567        ];
1568        let list_array =
1569            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
1570                data, 3,
1571            )) as ArrayRef;
1572        let random_state = RandomState::with_seed(0);
1573        let mut hashes = vec![0; list_array.len()];
1574        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1575        assert_eq!(hashes[0], hashes[5]);
1576        assert_eq!(hashes[1], hashes[4]);
1577        assert_eq!(hashes[2], hashes[3]);
1578    }
1579
1580    #[test]
1581    // Tests actual values of hashes, which are different if forcing collisions
1582    #[cfg(not(feature = "force_hash_collisions"))]
1583    fn create_hashes_for_struct_arrays() {
1584        use arrow::buffer::Buffer;
1585
1586        let boolarr = Arc::new(BooleanArray::from(vec![
1587            false, false, true, true, true, true,
1588        ]));
1589        let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
1590
1591        let struct_array = StructArray::from((
1592            vec![
1593                (
1594                    Arc::new(Field::new("bool", DataType::Boolean, false)),
1595                    Arc::clone(&boolarr) as ArrayRef,
1596                ),
1597                (
1598                    Arc::new(Field::new("i32", DataType::Int32, false)),
1599                    Arc::clone(&i32arr) as ArrayRef,
1600                ),
1601                (
1602                    Arc::new(Field::new("i32", DataType::Int32, false)),
1603                    Arc::clone(&i32arr) as ArrayRef,
1604                ),
1605                (
1606                    Arc::new(Field::new("bool", DataType::Boolean, false)),
1607                    Arc::clone(&boolarr) as ArrayRef,
1608                ),
1609            ],
1610            Buffer::from(&[0b001011]),
1611        ));
1612
1613        assert!(struct_array.is_valid(0));
1614        assert!(struct_array.is_valid(1));
1615        assert!(struct_array.is_null(2));
1616        assert!(struct_array.is_valid(3));
1617        assert!(struct_array.is_null(4));
1618        assert!(struct_array.is_null(5));
1619
1620        let array = Arc::new(struct_array) as ArrayRef;
1621
1622        let random_state = RandomState::with_seed(0);
1623        let mut hashes = vec![0; array.len()];
1624        create_hashes(&[array], &random_state, &mut hashes).unwrap();
1625        assert_eq!(hashes[0], hashes[1]);
1626        // same value but the third row ( hashes[2] ) is null
1627        assert_ne!(hashes[2], hashes[3]);
1628        // different values but both are null
1629        assert_eq!(hashes[4], hashes[5]);
1630    }
1631
1632    #[test]
1633    // Tests actual values of hashes, which are different if forcing collisions
1634    #[cfg(not(feature = "force_hash_collisions"))]
1635    fn create_hashes_for_struct_arrays_more_column_than_row() {
1636        let struct_array = StructArray::from(vec![
1637            (
1638                Arc::new(Field::new("bool", DataType::Boolean, false)),
1639                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
1640            ),
1641            (
1642                Arc::new(Field::new("i32-1", DataType::Int32, false)),
1643                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1644            ),
1645            (
1646                Arc::new(Field::new("i32-2", DataType::Int32, false)),
1647                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1648            ),
1649            (
1650                Arc::new(Field::new("i32-3", DataType::Int32, false)),
1651                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
1652            ),
1653        ]);
1654
1655        assert!(struct_array.is_valid(0));
1656        assert!(struct_array.is_valid(1));
1657
1658        let array = Arc::new(struct_array) as ArrayRef;
1659        let random_state = RandomState::with_seed(0);
1660        let mut hashes = vec![0; array.len()];
1661        create_hashes(&[array], &random_state, &mut hashes).unwrap();
1662        assert_eq!(hashes[0], hashes[1]);
1663    }
1664
1665    #[test]
1666    // Tests actual values of hashes, which are different if forcing collisions
1667    #[cfg(not(feature = "force_hash_collisions"))]
1668    fn create_hashes_for_map_arrays() {
1669        let mut builder =
1670            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
1671        // Row 0
1672        builder.keys().append_value("key1");
1673        builder.keys().append_value("key2");
1674        builder.values().append_value(1);
1675        builder.values().append_value(2);
1676        builder.append(true).unwrap();
1677        // Row 1
1678        builder.keys().append_value("key1");
1679        builder.keys().append_value("key2");
1680        builder.values().append_value(1);
1681        builder.values().append_value(2);
1682        builder.append(true).unwrap();
1683        // Row 2
1684        builder.keys().append_value("key1");
1685        builder.keys().append_value("key2");
1686        builder.values().append_value(1);
1687        builder.values().append_value(3);
1688        builder.append(true).unwrap();
1689        // Row 3
1690        builder.keys().append_value("key1");
1691        builder.keys().append_value("key3");
1692        builder.values().append_value(1);
1693        builder.values().append_value(2);
1694        builder.append(true).unwrap();
1695        // Row 4
1696        builder.keys().append_value("key1");
1697        builder.values().append_value(1);
1698        builder.append(true).unwrap();
1699        // Row 5
1700        builder.keys().append_value("key1");
1701        builder.values().append_null();
1702        builder.append(true).unwrap();
1703        // Row 6
1704        builder.append(true).unwrap();
1705        // Row 7
1706        builder.keys().append_value("key1");
1707        builder.values().append_value(1);
1708        builder.append(false).unwrap();
1709
1710        let array = Arc::new(builder.finish()) as ArrayRef;
1711
1712        let random_state = RandomState::with_seed(0);
1713        let mut hashes = vec![0; array.len()];
1714        create_hashes(&[array], &random_state, &mut hashes).unwrap();
1715        assert_eq!(hashes[0], hashes[1]); // same value
1716        assert_ne!(hashes[0], hashes[2]); // different value
1717        assert_ne!(hashes[0], hashes[3]); // different key
1718        assert_ne!(hashes[0], hashes[4]); // missing an entry
1719        assert_ne!(hashes[4], hashes[5]); // filled vs null value
1720        assert_eq!(hashes[6], hashes[7]); // empty vs null map
1721    }
1722
1723    #[test]
1724    // Tests actual values of hashes, which are different if forcing collisions
1725    #[cfg(not(feature = "force_hash_collisions"))]
1726    fn create_multi_column_hash_for_dict_arrays() {
1727        let strings1 = [Some("foo"), None, Some("bar")];
1728        let strings2 = [Some("blarg"), Some("blah"), None];
1729
1730        let string_array: ArrayRef =
1731            Arc::new(strings1.iter().cloned().collect::<StringArray>());
1732        let dict_array: ArrayRef = Arc::new(
1733            strings2
1734                .iter()
1735                .cloned()
1736                .collect::<DictionaryArray<Int32Type>>(),
1737        );
1738
1739        let random_state = RandomState::with_seed(0);
1740
1741        let mut one_col_hashes = vec![0; strings1.len()];
1742        create_hashes(
1743            &[Arc::clone(&dict_array) as ArrayRef],
1744            &random_state,
1745            &mut one_col_hashes,
1746        )
1747        .unwrap();
1748
1749        let mut two_col_hashes = vec![0; strings1.len()];
1750        create_hashes(
1751            &[dict_array, string_array],
1752            &random_state,
1753            &mut two_col_hashes,
1754        )
1755        .unwrap();
1756
1757        assert_eq!(one_col_hashes.len(), 3);
1758        assert_eq!(two_col_hashes.len(), 3);
1759
1760        assert_ne!(one_col_hashes, two_col_hashes);
1761    }
1762
1763    #[test]
1764    fn test_create_hashes_from_arrays() {
1765        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1766        let float_array: ArrayRef =
1767            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
1768
1769        let random_state = RandomState::with_seed(0);
1770        let hashes_buff = &mut vec![0; int_array.len()];
1771        let hashes =
1772            create_hashes(&[int_array, float_array], &random_state, hashes_buff).unwrap();
1773        assert_eq!(hashes.len(), 4,);
1774    }
1775
1776    #[test]
1777    fn test_create_hashes_from_dyn_arrays() {
1778        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1779        let float_array: ArrayRef =
1780            Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
1781
1782        // Verify that we can call create_hashes with only &dyn Array
1783        fn test(arr1: &dyn Array, arr2: &dyn Array) {
1784            let random_state = RandomState::with_seed(0);
1785            let hashes_buff = &mut vec![0; arr1.len()];
1786            let hashes = create_hashes([arr1, arr2], &random_state, hashes_buff).unwrap();
1787            assert_eq!(hashes.len(), 4,);
1788        }
1789        test(&*int_array, &*float_array);
1790    }
1791
1792    #[test]
1793    fn test_create_hashes_equivalence() {
1794        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1795        let random_state = RandomState::with_seed(0);
1796
1797        let mut hashes1 = vec![0; array.len()];
1798        create_hashes(
1799            &[Arc::clone(&array) as ArrayRef],
1800            &random_state,
1801            &mut hashes1,
1802        )
1803        .unwrap();
1804
1805        let mut hashes2 = vec![0; array.len()];
1806        create_hashes([array], &random_state, &mut hashes2).unwrap();
1807
1808        assert_eq!(hashes1, hashes2);
1809    }
1810
1811    #[test]
1812    fn test_with_hashes() {
1813        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4]));
1814        let random_state = RandomState::with_seed(0);
1815
1816        // Test that with_hashes produces the same results as create_hashes
1817        let mut expected_hashes = vec![0; array.len()];
1818        create_hashes([&array], &random_state, &mut expected_hashes).unwrap();
1819
1820        let result = with_hashes([&array], &random_state, |hashes| {
1821            assert_eq!(hashes.len(), 4);
1822            // Verify hashes match expected values
1823            assert_eq!(hashes, &expected_hashes[..]);
1824            // Return a copy of the hashes
1825            Ok(hashes.to_vec())
1826        })
1827        .unwrap();
1828
1829        // Verify callback result is returned correctly
1830        assert_eq!(result, expected_hashes);
1831    }
1832
1833    #[test]
1834    fn test_with_hashes_multi_column() {
1835        let int_array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
1836        let str_array: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
1837        let random_state = RandomState::with_seed(0);
1838
1839        // Test multi-column hashing
1840        let mut expected_hashes = vec![0; int_array.len()];
1841        create_hashes(
1842            [&int_array, &str_array],
1843            &random_state,
1844            &mut expected_hashes,
1845        )
1846        .unwrap();
1847
1848        with_hashes([&int_array, &str_array], &random_state, |hashes| {
1849            assert_eq!(hashes.len(), 3);
1850            assert_eq!(hashes, &expected_hashes[..]);
1851            Ok(())
1852        })
1853        .unwrap();
1854    }
1855
1856    #[test]
1857    fn test_with_hashes_empty_arrays() {
1858        let random_state = RandomState::with_seed(0);
1859
1860        // Test that passing no arrays returns an error
1861        let empty: [&ArrayRef; 0] = [];
1862        let result = with_hashes(empty, &random_state, |_hashes| Ok(()));
1863
1864        assert!(result.is_err());
1865        assert!(
1866            result
1867                .unwrap_err()
1868                .to_string()
1869                .contains("requires at least one array")
1870        );
1871    }
1872
1873    #[test]
1874    fn test_with_hashes_reentrancy() {
1875        let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
1876        let array2: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6]));
1877        let random_state = RandomState::with_seed(0);
1878
1879        // Test that reentrant calls return an error instead of panicking
1880        let result = with_hashes([&array], &random_state, |_hashes| {
1881            // Try to call with_hashes again inside the callback
1882            with_hashes([&array2], &random_state, |_inner_hashes| Ok(()))
1883        });
1884
1885        assert!(result.is_err());
1886        let err_msg = result.unwrap_err().to_string();
1887        assert!(
1888            err_msg.contains("reentrantly") || err_msg.contains("cannot be called"),
1889            "Error message should mention reentrancy: {err_msg}",
1890        );
1891    }
1892
1893    #[test]
1894    #[cfg(not(feature = "force_hash_collisions"))]
1895    fn create_hashes_for_sparse_union_arrays() {
1896        // logical array: [int(5), str("foo"), int(10), int(5)]
1897        let int_array = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]);
1898        let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
1899
1900        let type_ids = vec![0_i8, 1, 0, 0].into();
1901        let children = vec![
1902            Arc::new(int_array) as ArrayRef,
1903            Arc::new(str_array) as ArrayRef,
1904        ];
1905
1906        let union_fields = [
1907            (0, Arc::new(Field::new("a", DataType::Int32, true))),
1908            (1, Arc::new(Field::new("b", DataType::Utf8, true))),
1909        ]
1910        .into_iter()
1911        .collect();
1912
1913        let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
1914        let array_ref = Arc::new(array) as ArrayRef;
1915
1916        let random_state = RandomState::with_seed(0);
1917        let mut hashes = vec![0; array_ref.len()];
1918        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1919
1920        // Rows 0 and 3 both have type_id=0 (int) with value 5
1921        assert_eq!(hashes[0], hashes[3]);
1922        // Row 0 (int 5) vs Row 2 (int 10) - different values
1923        assert_ne!(hashes[0], hashes[2]);
1924        // Row 0 (int) vs Row 1 (string) - different types
1925        assert_ne!(hashes[0], hashes[1]);
1926    }
1927
1928    #[test]
1929    #[cfg(not(feature = "force_hash_collisions"))]
1930    fn create_hashes_for_sparse_union_arrays_with_nulls() {
1931        // logical array: [int(5), str("foo"), int(null), str(null)]
1932        let int_array = Int32Array::from(vec![Some(5), None, None, None]);
1933        let str_array = StringArray::from(vec![None, Some("foo"), None, None]);
1934
1935        let type_ids = vec![0, 1, 0, 1].into();
1936        let children = vec![
1937            Arc::new(int_array) as ArrayRef,
1938            Arc::new(str_array) as ArrayRef,
1939        ];
1940
1941        let union_fields = [
1942            (0, Arc::new(Field::new("a", DataType::Int32, true))),
1943            (1, Arc::new(Field::new("b", DataType::Utf8, true))),
1944        ]
1945        .into_iter()
1946        .collect();
1947
1948        let array = UnionArray::try_new(union_fields, type_ids, None, children).unwrap();
1949        let array_ref = Arc::new(array) as ArrayRef;
1950
1951        let random_state = RandomState::with_seed(0);
1952        let mut hashes = vec![0; array_ref.len()];
1953        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1954
1955        // row 2 (int null) and row 3 (str null) should have the same hash
1956        // because they are both null values
1957        assert_eq!(hashes[2], hashes[3]);
1958
1959        // row 0 (int 5) vs row 2 (int null) - different (value vs null)
1960        assert_ne!(hashes[0], hashes[2]);
1961
1962        // row 1 (str "foo") vs row 3 (str null) - different (value vs null)
1963        assert_ne!(hashes[1], hashes[3]);
1964    }
1965
1966    #[test]
1967    #[cfg(not(feature = "force_hash_collisions"))]
1968    fn create_hashes_for_dense_union_arrays() {
1969        // creates a dense union array with int and string types
1970        // [67, "norm", 100, "macdonald", 67]
1971        let int_array = Int32Array::from(vec![67, 100, 67]);
1972        let str_array = StringArray::from(vec!["norm", "macdonald"]);
1973
1974        let type_ids = vec![0, 1, 0, 1, 0].into();
1975        let offsets = vec![0, 0, 1, 1, 2].into();
1976        let children = vec![
1977            Arc::new(int_array) as ArrayRef,
1978            Arc::new(str_array) as ArrayRef,
1979        ];
1980
1981        let union_fields = [
1982            (0, Arc::new(Field::new("a", DataType::Int32, false))),
1983            (1, Arc::new(Field::new("b", DataType::Utf8, false))),
1984        ]
1985        .into_iter()
1986        .collect();
1987
1988        let array =
1989            UnionArray::try_new(union_fields, type_ids, Some(offsets), children).unwrap();
1990        let array_ref = Arc::new(array) as ArrayRef;
1991
1992        let random_state = RandomState::with_seed(0);
1993        let mut hashes = vec![0; array_ref.len()];
1994        create_hashes(&[array_ref], &random_state, &mut hashes).unwrap();
1995
1996        // 67 vs "norm"
1997        assert_ne!(hashes[0], hashes[1]);
1998        // 67 vs 100
1999        assert_ne!(hashes[0], hashes[2]);
2000        // "norm" vs "macdonald"
2001        assert_ne!(hashes[1], hashes[3]);
2002        // 100 vs "macdonald"
2003        assert_ne!(hashes[2], hashes[3]);
2004        // 67 vs 67
2005        assert_eq!(hashes[0], hashes[4]);
2006    }
2007
2008    #[test]
2009    #[cfg(not(feature = "force_hash_collisions"))]
2010    fn create_hashes_for_sliced_run_array() -> Result<()> {
2011        let values = Arc::new(Int32Array::from(vec![10, 20, 30]));
2012        let run_ends = Arc::new(Int32Array::from(vec![2, 5, 7]));
2013        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
2014
2015        let random_state = RandomState::with_seed(0);
2016        let mut full_hashes = vec![0; array.len()];
2017        create_hashes(
2018            &[Arc::clone(&array) as ArrayRef],
2019            &random_state,
2020            &mut full_hashes,
2021        )?;
2022
2023        let array_ref: ArrayRef = Arc::clone(&array) as ArrayRef;
2024        let sliced_array = array_ref.slice(2, 3);
2025
2026        let mut sliced_hashes = vec![0; sliced_array.len()];
2027        create_hashes(
2028            std::slice::from_ref(&sliced_array),
2029            &random_state,
2030            &mut sliced_hashes,
2031        )?;
2032
2033        assert_eq!(sliced_hashes.len(), 3);
2034        assert_eq!(sliced_hashes[0], sliced_hashes[1]);
2035        assert_eq!(sliced_hashes[1], sliced_hashes[2]);
2036        assert_eq!(&sliced_hashes, &full_hashes[2..5]);
2037
2038        Ok(())
2039    }
2040
2041    #[test]
2042    #[cfg(not(feature = "force_hash_collisions"))]
2043    fn test_run_array_with_nulls() -> Result<()> {
2044        let values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2045        let run_ends = Arc::new(Int32Array::from(vec![2, 4, 6]));
2046        let array = Arc::new(RunArray::try_new(&run_ends, values.as_ref()).unwrap());
2047
2048        let random_state = RandomState::with_seed(0);
2049        let mut hashes = vec![0; array.len()];
2050        create_hashes(
2051            &[Arc::clone(&array) as ArrayRef],
2052            &random_state,
2053            &mut hashes,
2054        )?;
2055
2056        assert_eq!(hashes[0], hashes[1]);
2057        assert_ne!(hashes[0], 0);
2058        assert_eq!(hashes[2], hashes[3]);
2059        assert_eq!(hashes[2], 0);
2060        assert_eq!(hashes[4], hashes[5]);
2061        assert_ne!(hashes[4], 0);
2062        assert_ne!(hashes[0], hashes[4]);
2063
2064        Ok(())
2065    }
2066
2067    #[test]
2068    #[cfg(not(feature = "force_hash_collisions"))]
2069    fn test_run_array_with_nulls_multicolumn() -> Result<()> {
2070        let primitive_array = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2071        let run_values = Arc::new(Int32Array::from(vec![Some(10), None, Some(20)]));
2072        let run_ends = Arc::new(Int32Array::from(vec![1, 2, 3]));
2073        let run_array =
2074            Arc::new(RunArray::try_new(&run_ends, run_values.as_ref()).unwrap());
2075        let second_col = Arc::new(Int32Array::from(vec![100, 200, 300]));
2076
2077        let random_state = RandomState::with_seed(0);
2078
2079        let mut primitive_hashes = vec![0; 3];
2080        create_hashes(
2081            &[
2082                Arc::clone(&primitive_array) as ArrayRef,
2083                Arc::clone(&second_col) as ArrayRef,
2084            ],
2085            &random_state,
2086            &mut primitive_hashes,
2087        )?;
2088
2089        let mut run_hashes = vec![0; 3];
2090        create_hashes(
2091            &[
2092                Arc::clone(&run_array) as ArrayRef,
2093                Arc::clone(&second_col) as ArrayRef,
2094            ],
2095            &random_state,
2096            &mut run_hashes,
2097        )?;
2098
2099        assert_eq!(primitive_hashes, run_hashes);
2100
2101        Ok(())
2102    }
2103}