Skip to main content

arrow_array/array/
run_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::sync::Arc;
20
21use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, RunEndBuffer};
22use arrow_data::{ArrayData, ArrayDataBuilder};
23use arrow_schema::{ArrowError, DataType, Field};
24
25use crate::{
26    Array, ArrayAccessor, ArrayRef, PrimitiveArray,
27    builder::StringRunBuilder,
28    make_array,
29    run_iterator::RunArrayIter,
30    types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
31};
32
33/// An array of [run-end encoded values].
34///
35/// This encoding is variation on [run-length encoding (RLE)] and is good for representing
36/// data containing the same values repeated consecutively.
37///
38/// A [`RunArray`] consists of a `run_ends` buffer and a `values` array of equivalent
39/// lengths. The `run_ends` buffer stores the indexes at which the run ends. The
40/// `values` array stores the corresponding value of each run. The below example
41/// illustrates how a logical array is represented by a [`RunArray`]:
42///
43/// ```text
44/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┐
45///   ┌─────────────────┐  ┌─────────┐       ┌─────────────────┐
46/// │ │        A        │  │    2    │ │     │        A        │
47///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
48/// │ │        D        │  │    3    │ │     │        A        │    run length of 'A' = runs_ends[0] - 0 = 2
49///   ├─────────────────┤  ├─────────┤       ├─────────────────┤
50/// │ │        B        │  │    6    │ │     │        D        │    run length of 'D' = run_ends[1] - run_ends[0] = 1
51///   └─────────────────┘  └─────────┘       ├─────────────────┤
52/// │        values          run_ends  │     │        B        │
53///                                          ├─────────────────┤
54/// └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘     │        B        │
55///                                          ├─────────────────┤
56///                RunArray                  │        B        │    run length of 'B' = run_ends[2] - run_ends[1] = 3
57///               length = 3                 └─────────────────┘
58///
59///                                             Logical array
60///                                                Contents
61/// ```
62///
63/// [run-end encoded values]: https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout
64/// [run-length encoding (RLE)]: https://en.wikipedia.org/wiki/Run-length_encoding
65pub struct RunArray<R: RunEndIndexType> {
66    data_type: DataType,
67    run_ends: RunEndBuffer<R::Native>,
68    values: ArrayRef,
69}
70
71impl<R: RunEndIndexType> Clone for RunArray<R> {
72    fn clone(&self) -> Self {
73        Self {
74            data_type: self.data_type.clone(),
75            run_ends: self.run_ends.clone(),
76            values: self.values.clone(),
77        }
78    }
79}
80
81impl<R: RunEndIndexType> RunArray<R> {
82    /// Calculates the logical length of the array encoded by treating the `run_ends`
83    /// array as if it were a [`RunEndBuffer`].
84    pub fn logical_len(run_ends: &PrimitiveArray<R>) -> usize {
85        let len = run_ends.len();
86        if len == 0 {
87            return 0;
88        }
89        run_ends.value(len - 1).as_usize()
90    }
91
92    /// Attempts to create a [`RunArray`] using the given `run_ends` and `values`.
93    ///
94    /// # Errors
95    ///
96    /// - If `run_ends` and `values` have different lengths
97    /// - If `run_ends` has any null values
98    /// - If `run_ends` doesn't consist of strictly increasing positive integers
99    pub fn try_new(run_ends: &PrimitiveArray<R>, values: &dyn Array) -> Result<Self, ArrowError> {
100        let run_ends_type = run_ends.data_type().clone();
101        let values_type = values.data_type().clone();
102        let ree_array_type = DataType::RunEndEncoded(
103            Arc::new(Field::new("run_ends", run_ends_type, false)),
104            Arc::new(Field::new("values", values_type, true)),
105        );
106        let len = RunArray::logical_len(run_ends);
107        let builder = ArrayDataBuilder::new(ree_array_type)
108            .len(len)
109            .add_child_data(run_ends.to_data())
110            .add_child_data(values.to_data());
111
112        // `build_unchecked` is used to avoid recursive validation of child arrays.
113        let array_data = unsafe { builder.build_unchecked() };
114
115        // Safety: `validate_data` checks below
116        //    1. The given array data has exactly two child arrays.
117        //    2. The first child array (run_ends) has valid data type.
118        //    3. run_ends array does not have null values
119        //    4. run_ends array has non-zero and strictly increasing values.
120        //    5. The length of run_ends array and values array are the same.
121        array_data.validate_data()?;
122
123        Ok(array_data.into())
124    }
125
126    /// Returns a reference to the [`RunEndBuffer`].
127    pub fn run_ends(&self) -> &RunEndBuffer<R::Native> {
128        &self.run_ends
129    }
130
131    /// Returns a reference to the values array.
132    ///
133    /// Any slicing of this [`RunArray`] array is **not** applied to the returned
134    /// values here and must be handled separately.
135    pub fn values(&self) -> &ArrayRef {
136        &self.values
137    }
138
139    /// Returns the physical index at which the array slice starts.
140    ///
141    /// See [`RunEndBuffer::get_start_physical_index`].
142    pub fn get_start_physical_index(&self) -> usize {
143        self.run_ends.get_start_physical_index()
144    }
145
146    /// Returns the physical index at which the array slice ends.
147    ///
148    /// See [`RunEndBuffer::get_end_physical_index`].
149    pub fn get_end_physical_index(&self) -> usize {
150        self.run_ends.get_end_physical_index()
151    }
152
153    /// Downcast this [`RunArray`] to a [`TypedRunArray`]
154    ///
155    /// ```
156    /// use arrow_array::{Array, ArrayAccessor, RunArray, StringArray, types::Int32Type};
157    ///
158    /// let orig = [Some("a"), Some("b"), None];
159    /// let run_array = RunArray::<Int32Type>::from_iter(orig);
160    /// let typed = run_array.downcast::<StringArray>().unwrap();
161    /// assert_eq!(typed.value(0), "a");
162    /// assert_eq!(typed.value(1), "b");
163    /// assert!(typed.values().is_null(2));
164    /// ```
165    pub fn downcast<V: 'static>(&self) -> Option<TypedRunArray<'_, R, V>> {
166        let values = self.values.as_any().downcast_ref()?;
167        Some(TypedRunArray {
168            run_array: self,
169            values,
170        })
171    }
172
173    /// Calls [`RunEndBuffer::get_physical_index`].
174    ///
175    /// The result is arbitrary if `logical_index >= self.len()`
176    pub fn get_physical_index(&self, logical_index: usize) -> usize {
177        self.run_ends.get_physical_index(logical_index)
178    }
179
180    /// Returns the physical indices corresponding to the provided logical indices.
181    ///
182    /// See [`RunEndBuffer::get_physical_indices`] for more details.
183    #[inline]
184    pub fn get_physical_indices<I>(&self, logical_indices: &[I]) -> Result<Vec<usize>, ArrowError>
185    where
186        I: ArrowNativeType,
187    {
188        self.run_ends()
189            .get_physical_indices(logical_indices)
190            .map_err(|index| {
191                ArrowError::InvalidArgumentError(format!(
192                    "Logical index {} is out of bounds for RunArray of length {}",
193                    index.as_usize(),
194                    self.len()
195                ))
196            })
197    }
198
199    /// Returns a zero-copy slice of this array with the indicated offset and length.
200    ///
201    /// # Panics
202    ///
203    /// - Specified slice (`offset` + `length`) exceeds existing length
204    pub fn slice(&self, offset: usize, length: usize) -> Self {
205        Self {
206            data_type: self.data_type.clone(),
207            run_ends: self.run_ends.slice(offset, length),
208            values: self.values.clone(),
209        }
210    }
211}
212
213impl<R: RunEndIndexType> From<ArrayData> for RunArray<R> {
214    // The method assumes the caller already validated the data using `ArrayData::validate_data()`
215    fn from(data: ArrayData) -> Self {
216        match data.data_type() {
217            DataType::RunEndEncoded(_, _) => {}
218            _ => {
219                panic!(
220                    "Invalid data type for RunArray. The data type should be DataType::RunEndEncoded"
221                );
222            }
223        }
224
225        // Safety
226        // ArrayData is valid
227        let child = &data.child_data()[0];
228        assert_eq!(child.data_type(), &R::DATA_TYPE, "Incorrect run ends type");
229        let run_ends = unsafe {
230            let scalar = child.buffers()[0].clone().into();
231            RunEndBuffer::new_unchecked(scalar, data.offset(), data.len())
232        };
233
234        let values = make_array(data.child_data()[1].clone());
235        Self {
236            data_type: data.data_type().clone(),
237            run_ends,
238            values,
239        }
240    }
241}
242
243impl<R: RunEndIndexType> From<RunArray<R>> for ArrayData {
244    fn from(array: RunArray<R>) -> Self {
245        let len = array.run_ends.len();
246        let offset = array.run_ends.offset();
247
248        let run_ends = ArrayDataBuilder::new(R::DATA_TYPE)
249            .len(array.run_ends.values().len())
250            .buffers(vec![array.run_ends.into_inner().into_inner()]);
251
252        let run_ends = unsafe { run_ends.build_unchecked() };
253
254        let builder = ArrayDataBuilder::new(array.data_type)
255            .len(len)
256            .offset(offset)
257            .child_data(vec![run_ends, array.values.to_data()]);
258
259        unsafe { builder.build_unchecked() }
260    }
261}
262
263/// SAFETY: Correctly implements the contract of Arrow Arrays
264unsafe impl<T: RunEndIndexType> Array for RunArray<T> {
265    fn as_any(&self) -> &dyn Any {
266        self
267    }
268
269    fn to_data(&self) -> ArrayData {
270        self.clone().into()
271    }
272
273    fn into_data(self) -> ArrayData {
274        self.into()
275    }
276
277    fn data_type(&self) -> &DataType {
278        &self.data_type
279    }
280
281    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
282        Arc::new(self.slice(offset, length))
283    }
284
285    fn len(&self) -> usize {
286        self.run_ends.len()
287    }
288
289    fn is_empty(&self) -> bool {
290        self.run_ends.is_empty()
291    }
292
293    fn shrink_to_fit(&mut self) {
294        self.run_ends.shrink_to_fit();
295        self.values.shrink_to_fit();
296    }
297
298    fn offset(&self) -> usize {
299        self.run_ends.offset()
300    }
301
302    fn nulls(&self) -> Option<&NullBuffer> {
303        None
304    }
305
306    fn logical_nulls(&self) -> Option<NullBuffer> {
307        let len = self.len();
308        let nulls = self.values.logical_nulls()?;
309        let mut out = BooleanBufferBuilder::new(len);
310        let offset = self.run_ends.offset();
311        let mut valid_start = 0;
312        let mut last_end = 0;
313        for (idx, end) in self.run_ends.values().iter().enumerate() {
314            let end = end.as_usize();
315            if end < offset {
316                continue;
317            }
318            let end = (end - offset).min(len);
319            if nulls.is_null(idx) {
320                if valid_start < last_end {
321                    out.append_n(last_end - valid_start, true);
322                }
323                out.append_n(end - last_end, false);
324                valid_start = end;
325            }
326            last_end = end;
327            if end == len {
328                break;
329            }
330        }
331        if valid_start < len {
332            out.append_n(len - valid_start, true)
333        }
334        // Sanity check
335        assert_eq!(out.len(), len);
336        Some(out.finish().into())
337    }
338
339    fn is_nullable(&self) -> bool {
340        !self.is_empty() && self.values.is_nullable()
341    }
342
343    fn get_buffer_memory_size(&self) -> usize {
344        self.run_ends.inner().inner().capacity() + self.values.get_buffer_memory_size()
345    }
346
347    fn get_array_memory_size(&self) -> usize {
348        std::mem::size_of::<Self>()
349            + self.run_ends.inner().inner().capacity()
350            + self.values.get_array_memory_size()
351    }
352}
353
354impl<R: RunEndIndexType> std::fmt::Debug for RunArray<R> {
355    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
356        writeln!(
357            f,
358            "RunArray {{run_ends: {:?}, values: {:?}}}",
359            self.run_ends.values(),
360            self.values
361        )
362    }
363}
364
365/// Constructs a `RunArray` from an iterator of optional strings.
366///
367/// # Example:
368/// ```
369/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
370///
371/// let test = vec!["a", "a", "b", "c", "c"];
372/// let array: RunArray<Int16Type> = test
373///     .iter()
374///     .map(|&x| if x == "b" { None } else { Some(x) })
375///     .collect();
376/// assert_eq!(
377///     "RunArray {run_ends: [2, 3, 5], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
378///     format!("{:?}", array)
379/// );
380/// ```
381impl<'a, T: RunEndIndexType> FromIterator<Option<&'a str>> for RunArray<T> {
382    fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
383        let it = iter.into_iter();
384        let (lower, _) = it.size_hint();
385        let mut builder = StringRunBuilder::with_capacity(lower, 256);
386        it.for_each(|i| {
387            builder.append_option(i);
388        });
389
390        builder.finish()
391    }
392}
393
394/// Constructs a `RunArray` from an iterator of strings.
395///
396/// # Example:
397///
398/// ```
399/// use arrow_array::{RunArray, PrimitiveArray, StringArray, types::Int16Type};
400///
401/// let test = vec!["a", "a", "b", "c"];
402/// let array: RunArray<Int16Type> = test.into_iter().collect();
403/// assert_eq!(
404///     "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
405///     format!("{:?}", array)
406/// );
407/// ```
408impl<'a, T: RunEndIndexType> FromIterator<&'a str> for RunArray<T> {
409    fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
410        let it = iter.into_iter();
411        let (lower, _) = it.size_hint();
412        let mut builder = StringRunBuilder::with_capacity(lower, 256);
413        it.for_each(|i| {
414            builder.append_value(i);
415        });
416
417        builder.finish()
418    }
419}
420
421///
422/// A [`RunArray`] with `i16` run ends
423///
424/// # Example: Using `collect`
425/// ```
426/// # use arrow_array::{Array, Int16RunArray, Int16Array, StringArray};
427/// # use std::sync::Arc;
428///
429/// let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
430/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
431/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
432/// assert_eq!(array.values(), &values);
433/// ```
434pub type Int16RunArray = RunArray<Int16Type>;
435
436///
437/// A [`RunArray`] with `i32` run ends
438///
439/// # Example: Using `collect`
440/// ```
441/// # use arrow_array::{Array, Int32RunArray, Int32Array, StringArray};
442/// # use std::sync::Arc;
443///
444/// let array: Int32RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
445/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
446/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
447/// assert_eq!(array.values(), &values);
448/// ```
449pub type Int32RunArray = RunArray<Int32Type>;
450
451///
452/// A [`RunArray`] with `i64` run ends
453///
454/// # Example: Using `collect`
455/// ```
456/// # use arrow_array::{Array, Int64RunArray, Int64Array, StringArray};
457/// # use std::sync::Arc;
458///
459/// let array: Int64RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
460/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
461/// assert_eq!(array.run_ends().values(), &[2, 3, 5]);
462/// assert_eq!(array.values(), &values);
463/// ```
464pub type Int64RunArray = RunArray<Int64Type>;
465
466/// A [`RunArray`] typed typed on its child values array
467///
468/// Implements [`ArrayAccessor`] and [`IntoIterator`] allowing fast access to its elements
469///
470/// ```
471/// use arrow_array::{RunArray, StringArray, types::Int32Type};
472///
473/// let orig = ["a", "b", "a", "b"];
474/// let ree_array = RunArray::<Int32Type>::from_iter(orig);
475///
476/// // `TypedRunArray` allows you to access the values directly
477/// let typed = ree_array.downcast::<StringArray>().unwrap();
478///
479/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
480///     assert_eq!(maybe_val.unwrap(), orig)
481/// }
482/// ```
483pub struct TypedRunArray<'a, R: RunEndIndexType, V> {
484    /// The run array
485    run_array: &'a RunArray<R>,
486
487    /// The values of the run_array
488    values: &'a V,
489}
490
491// Manually implement `Clone` to avoid `V: Clone` type constraint
492impl<R: RunEndIndexType, V> Clone for TypedRunArray<'_, R, V> {
493    fn clone(&self) -> Self {
494        *self
495    }
496}
497
498impl<R: RunEndIndexType, V> Copy for TypedRunArray<'_, R, V> {}
499
500impl<R: RunEndIndexType, V> std::fmt::Debug for TypedRunArray<'_, R, V> {
501    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
502        writeln!(f, "TypedRunArray({:?})", self.run_array)
503    }
504}
505
506impl<'a, R: RunEndIndexType, V> TypedRunArray<'a, R, V> {
507    /// Returns the run_ends of this [`TypedRunArray`]
508    pub fn run_ends(&self) -> &'a RunEndBuffer<R::Native> {
509        self.run_array.run_ends()
510    }
511
512    /// Returns the values of this [`TypedRunArray`]
513    pub fn values(&self) -> &'a V {
514        self.values
515    }
516
517    /// Returns the run array of this [`TypedRunArray`]
518    pub fn run_array(&self) -> &'a RunArray<R> {
519        self.run_array
520    }
521}
522
523/// SAFETY: Correctly implements the contract of Arrow Arrays
524unsafe impl<R: RunEndIndexType, V: Sync> Array for TypedRunArray<'_, R, V> {
525    fn as_any(&self) -> &dyn Any {
526        self.run_array
527    }
528
529    fn to_data(&self) -> ArrayData {
530        self.run_array.to_data()
531    }
532
533    fn into_data(self) -> ArrayData {
534        self.run_array.into_data()
535    }
536
537    fn data_type(&self) -> &DataType {
538        self.run_array.data_type()
539    }
540
541    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
542        Arc::new(self.run_array.slice(offset, length))
543    }
544
545    fn len(&self) -> usize {
546        self.run_array.len()
547    }
548
549    fn is_empty(&self) -> bool {
550        self.run_array.is_empty()
551    }
552
553    fn offset(&self) -> usize {
554        self.run_array.offset()
555    }
556
557    fn nulls(&self) -> Option<&NullBuffer> {
558        self.run_array.nulls()
559    }
560
561    fn logical_nulls(&self) -> Option<NullBuffer> {
562        self.run_array.logical_nulls()
563    }
564
565    fn logical_null_count(&self) -> usize {
566        self.run_array.logical_null_count()
567    }
568
569    fn is_nullable(&self) -> bool {
570        self.run_array.is_nullable()
571    }
572
573    fn get_buffer_memory_size(&self) -> usize {
574        self.run_array.get_buffer_memory_size()
575    }
576
577    fn get_array_memory_size(&self) -> usize {
578        self.run_array.get_array_memory_size()
579    }
580}
581
582// Array accessor converts the index of logical array to the index of the physical array
583// using binary search. The time complexity is O(log N) where N is number of runs.
584impl<'a, R, V> ArrayAccessor for TypedRunArray<'a, R, V>
585where
586    R: RunEndIndexType,
587    V: Sync + Send,
588    &'a V: ArrayAccessor,
589    <&'a V as ArrayAccessor>::Item: Default,
590{
591    type Item = <&'a V as ArrayAccessor>::Item;
592
593    fn value(&self, logical_index: usize) -> Self::Item {
594        assert!(
595            logical_index < self.len(),
596            "Trying to access an element at index {} from a TypedRunArray of length {}",
597            logical_index,
598            self.len()
599        );
600        unsafe { self.value_unchecked(logical_index) }
601    }
602
603    unsafe fn value_unchecked(&self, logical_index: usize) -> Self::Item {
604        let physical_index = self.run_array.get_physical_index(logical_index);
605        unsafe { self.values().value_unchecked(physical_index) }
606    }
607}
608
609impl<'a, R, V> IntoIterator for TypedRunArray<'a, R, V>
610where
611    R: RunEndIndexType,
612    V: Sync + Send,
613    &'a V: ArrayAccessor,
614    <&'a V as ArrayAccessor>::Item: Default,
615{
616    type Item = Option<<&'a V as ArrayAccessor>::Item>;
617    type IntoIter = RunArrayIter<'a, R, V>;
618
619    fn into_iter(self) -> Self::IntoIter {
620        RunArrayIter::new(self)
621    }
622}
623
624#[cfg(test)]
625mod tests {
626    use rand::Rng;
627    use rand::rng;
628    use rand::seq::SliceRandom;
629
630    use super::*;
631    use crate::builder::PrimitiveRunBuilder;
632    use crate::cast::AsArray;
633    use crate::types::{Int8Type, UInt32Type};
634    use crate::{Int16Array, Int32Array, StringArray};
635
636    fn build_input_array(size: usize) -> Vec<Option<i32>> {
637        // The input array is created by shuffling and repeating
638        // the seed values random number of times.
639        let mut seed: Vec<Option<i32>> = vec![
640            None,
641            None,
642            None,
643            Some(1),
644            Some(2),
645            Some(3),
646            Some(4),
647            Some(5),
648            Some(6),
649            Some(7),
650            Some(8),
651            Some(9),
652        ];
653        let mut result: Vec<Option<i32>> = Vec::with_capacity(size);
654        let mut ix = 0;
655        let mut rng = rng();
656        // run length can go up to 8. Cap the max run length for smaller arrays to size / 2.
657        let max_run_length = 8_usize.min(1_usize.max(size / 2));
658        while result.len() < size {
659            // shuffle the seed array if all the values are iterated.
660            if ix == 0 {
661                seed.shuffle(&mut rng);
662            }
663            // repeat the items between 1 and 8 times. Cap the length for smaller sized arrays
664            let num = max_run_length.min(rng.random_range(1..=max_run_length));
665            for _ in 0..num {
666                result.push(seed[ix]);
667            }
668            ix += 1;
669            if ix == seed.len() {
670                ix = 0
671            }
672        }
673        result.resize(size, None);
674        result
675    }
676
677    // Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]`
678    fn compare_logical_and_physical_indices(
679        logical_indices: &[u32],
680        logical_array: &[Option<i32>],
681        physical_indices: &[usize],
682        physical_array: &PrimitiveArray<Int32Type>,
683    ) {
684        assert_eq!(logical_indices.len(), physical_indices.len());
685
686        // check value in logical index in the logical_array matches physical index in physical_array
687        logical_indices
688            .iter()
689            .map(|f| f.as_usize())
690            .zip(physical_indices.iter())
691            .for_each(|(logical_ix, physical_ix)| {
692                let expected = logical_array[logical_ix];
693                match expected {
694                    Some(val) => {
695                        assert!(physical_array.is_valid(*physical_ix));
696                        let actual = physical_array.value(*physical_ix);
697                        assert_eq!(val, actual);
698                    }
699                    None => {
700                        assert!(physical_array.is_null(*physical_ix))
701                    }
702                };
703            });
704    }
705    #[test]
706    fn test_run_array() {
707        // Construct a value array
708        let value_data =
709            PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
710
711        // Construct a run_ends array:
712        let run_ends_values = [4_i16, 6, 7, 9, 13, 18, 20, 22];
713        let run_ends_data =
714            PrimitiveArray::<Int16Type>::from_iter_values(run_ends_values.iter().copied());
715
716        // Construct a run ends encoded array from the above two
717        let ree_array = RunArray::<Int16Type>::try_new(&run_ends_data, &value_data).unwrap();
718
719        assert_eq!(ree_array.len(), 22);
720        assert_eq!(ree_array.null_count(), 0);
721
722        let values = ree_array.values();
723        assert_eq!(value_data.into_data(), values.to_data());
724        assert_eq!(&DataType::Int8, values.data_type());
725
726        let run_ends = ree_array.run_ends();
727        assert_eq!(run_ends.values(), &run_ends_values);
728    }
729
730    #[test]
731    fn test_run_array_fmt_debug() {
732        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(3);
733        builder.append_value(12345678);
734        builder.append_null();
735        builder.append_value(22345678);
736        let array = builder.finish();
737        assert_eq!(
738            "RunArray {run_ends: [1, 2, 3], values: PrimitiveArray<UInt32>\n[\n  12345678,\n  null,\n  22345678,\n]}\n",
739            format!("{array:?}")
740        );
741
742        let mut builder = PrimitiveRunBuilder::<Int16Type, UInt32Type>::with_capacity(20);
743        for _ in 0..20 {
744            builder.append_value(1);
745        }
746        let array = builder.finish();
747
748        assert_eq!(array.len(), 20);
749        assert_eq!(array.null_count(), 0);
750        assert_eq!(array.logical_null_count(), 0);
751
752        assert_eq!(
753            "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n  1,\n]}\n",
754            format!("{array:?}")
755        );
756    }
757
758    #[test]
759    fn test_run_array_from_iter() {
760        let test = vec!["a", "a", "b", "c"];
761        let array: RunArray<Int16Type> = test
762            .iter()
763            .map(|&x| if x == "b" { None } else { Some(x) })
764            .collect();
765        assert_eq!(
766            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  null,\n  \"c\",\n]}\n",
767            format!("{array:?}")
768        );
769
770        assert_eq!(array.len(), 4);
771        assert_eq!(array.null_count(), 0);
772        assert_eq!(array.logical_null_count(), 1);
773
774        let array: RunArray<Int16Type> = test.into_iter().collect();
775        assert_eq!(
776            "RunArray {run_ends: [2, 3, 4], values: StringArray\n[\n  \"a\",\n  \"b\",\n  \"c\",\n]}\n",
777            format!("{array:?}")
778        );
779    }
780
781    #[test]
782    fn test_run_array_run_ends_as_primitive_array() {
783        let test = vec!["a", "b", "c", "a"];
784        let array: RunArray<Int16Type> = test.into_iter().collect();
785
786        assert_eq!(array.len(), 4);
787        assert_eq!(array.null_count(), 0);
788        assert_eq!(array.logical_null_count(), 0);
789
790        let run_ends = array.run_ends();
791        assert_eq!(&[1, 2, 3, 4], run_ends.values());
792    }
793
794    #[test]
795    fn test_run_array_as_primitive_array_with_null() {
796        let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
797        let array: RunArray<Int32Type> = test.into_iter().collect();
798
799        assert_eq!(array.len(), 6);
800        assert_eq!(array.null_count(), 0);
801        assert_eq!(array.logical_null_count(), 3);
802
803        let run_ends = array.run_ends();
804        assert_eq!(&[1, 2, 3, 5, 6], run_ends.values());
805
806        let values_data = array.values();
807        assert_eq!(2, values_data.null_count());
808        assert_eq!(5, values_data.len());
809    }
810
811    #[test]
812    fn test_run_array_all_nulls() {
813        let test = vec![None, None, None];
814        let array: RunArray<Int32Type> = test.into_iter().collect();
815
816        assert_eq!(array.len(), 3);
817        assert_eq!(array.null_count(), 0);
818        assert_eq!(array.logical_null_count(), 3);
819
820        let run_ends = array.run_ends();
821        assert_eq!(3, run_ends.len());
822        assert_eq!(&[3], run_ends.values());
823
824        let values_data = array.values();
825        assert_eq!(1, values_data.null_count());
826    }
827
828    #[test]
829    fn test_run_array_try_new() {
830        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
831            .into_iter()
832            .collect();
833        let run_ends: Int32Array = [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();
834
835        let array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
836        assert_eq!(array.values().data_type(), &DataType::Utf8);
837
838        assert_eq!(array.null_count(), 0);
839        assert_eq!(array.logical_null_count(), 1);
840        assert_eq!(array.len(), 4);
841        assert_eq!(array.values().null_count(), 1);
842
843        assert_eq!(
844            "RunArray {run_ends: [1, 2, 3, 4], values: StringArray\n[\n  \"foo\",\n  \"bar\",\n  null,\n  \"baz\",\n]}\n",
845            format!("{array:?}")
846        );
847    }
848
849    #[test]
850    fn test_run_array_int16_type_definition() {
851        let array: Int16RunArray = vec!["a", "a", "b", "c", "c"].into_iter().collect();
852        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
853        assert_eq!(array.run_ends().values(), &[2, 3, 5]);
854        assert_eq!(array.values(), &values);
855    }
856
857    #[test]
858    fn test_run_array_empty_string() {
859        let array: Int16RunArray = vec!["a", "a", "", "", "c"].into_iter().collect();
860        let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "", "c"]));
861        assert_eq!(array.run_ends().values(), &[2, 4, 5]);
862        assert_eq!(array.values(), &values);
863    }
864
865    #[test]
866    fn test_run_array_length_mismatch() {
867        let values: StringArray = [Some("foo"), Some("bar"), None, Some("baz")]
868            .into_iter()
869            .collect();
870        let run_ends: Int32Array = [Some(1), Some(2), Some(3)].into_iter().collect();
871
872        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
873        let expected = ArrowError::InvalidArgumentError("The run_ends array length should be the same as values array length. Run_ends array length is 3, values array length is 4".to_string());
874        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
875    }
876
877    #[test]
878    fn test_run_array_run_ends_with_null() {
879        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
880            .into_iter()
881            .collect();
882        let run_ends: Int32Array = [Some(1), None, Some(3)].into_iter().collect();
883
884        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
885        let expected = ArrowError::InvalidArgumentError(
886            "Found null values in run_ends array. The run_ends array should not have null values."
887                .to_string(),
888        );
889        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
890    }
891
892    #[test]
893    fn test_run_array_run_ends_with_zeroes() {
894        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
895            .into_iter()
896            .collect();
897        let run_ends: Int32Array = [Some(0), Some(1), Some(3)].into_iter().collect();
898
899        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
900        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly positive. Found value 0 at index 0 that does not match the criteria.".to_string());
901        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
902    }
903
904    #[test]
905    fn test_run_array_run_ends_non_increasing() {
906        let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
907            .into_iter()
908            .collect();
909        let run_ends: Int32Array = [Some(1), Some(4), Some(4)].into_iter().collect();
910
911        let actual = RunArray::<Int32Type>::try_new(&run_ends, &values);
912        let expected = ArrowError::InvalidArgumentError("The values in run_ends array should be strictly increasing. Found value 4 at index 2 with previous value 4 that does not match the criteria.".to_string());
913        assert_eq!(expected.to_string(), actual.err().unwrap().to_string());
914    }
915
916    #[test]
917    #[should_panic(expected = "Incorrect run ends type")]
918    fn test_run_array_run_ends_data_type_mismatch() {
919        let a = RunArray::<Int32Type>::from_iter(["32"]);
920        let _ = RunArray::<Int64Type>::from(a.into_data());
921    }
922
923    #[test]
924    fn test_ree_array_accessor() {
925        let input_array = build_input_array(256);
926
927        // Encode the input_array to ree_array
928        let mut builder =
929            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
930        builder.extend(input_array.iter().copied());
931        let run_array = builder.finish();
932        let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
933
934        // Access every index and check if the value in the input array matches returned value.
935        for (i, inp_val) in input_array.iter().enumerate() {
936            if let Some(val) = inp_val {
937                let actual = typed.value(i);
938                assert_eq!(*val, actual)
939            } else {
940                let physical_ix = run_array.get_physical_index(i);
941                assert!(typed.values().is_null(physical_ix));
942            };
943        }
944    }
945
946    #[test]
947    #[cfg_attr(miri, ignore)] // Takes too long
948    fn test_get_physical_indices() {
949        // Test for logical lengths starting from 10 to 250 increasing by 10
950        for logical_len in (0..250).step_by(10) {
951            let input_array = build_input_array(logical_len);
952
953            // create run array using input_array
954            let mut builder = PrimitiveRunBuilder::<Int32Type, Int32Type>::new();
955            builder.extend(input_array.clone().into_iter());
956
957            let run_array = builder.finish();
958            let physical_values_array = run_array.values().as_primitive::<Int32Type>();
959
960            // create an array consisting of all the indices repeated twice and shuffled.
961            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as u32)).collect();
962            // add same indices once more
963            logical_indices.append(&mut logical_indices.clone());
964            let mut rng = rng();
965            logical_indices.shuffle(&mut rng);
966
967            let physical_indices = run_array.get_physical_indices(&logical_indices).unwrap();
968
969            assert_eq!(logical_indices.len(), physical_indices.len());
970
971            // check value in logical index in the input_array matches physical index in typed_run_array
972            compare_logical_and_physical_indices(
973                &logical_indices,
974                &input_array,
975                &physical_indices,
976                physical_values_array,
977            );
978        }
979    }
980
981    #[test]
982    #[cfg_attr(miri, ignore)] // Takes too long
983    fn test_get_physical_indices_sliced() {
984        let total_len = 80;
985        let input_array = build_input_array(total_len);
986
987        // Encode the input_array to run array
988        let mut builder =
989            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
990        builder.extend(input_array.iter().copied());
991        let run_array = builder.finish();
992        let physical_values_array = run_array.values().as_primitive::<Int32Type>();
993
994        // test for all slice lengths.
995        for slice_len in 1..=total_len {
996            // create an array consisting of all the indices repeated twice and shuffled.
997            let mut logical_indices: Vec<u32> = (0_u32..(slice_len as u32)).collect();
998            // add same indices once more
999            logical_indices.append(&mut logical_indices.clone());
1000            let mut rng = rng();
1001            logical_indices.shuffle(&mut rng);
1002
1003            // test for offset = 0 and slice length = slice_len
1004            // slice the input array using which the run array was built.
1005            let sliced_input_array = &input_array[0..slice_len];
1006
1007            // slice the run array
1008            let sliced_run_array: RunArray<Int16Type> =
1009                run_array.slice(0, slice_len).into_data().into();
1010
1011            // Get physical indices.
1012            let physical_indices = sliced_run_array
1013                .get_physical_indices(&logical_indices)
1014                .unwrap();
1015
1016            compare_logical_and_physical_indices(
1017                &logical_indices,
1018                sliced_input_array,
1019                &physical_indices,
1020                physical_values_array,
1021            );
1022
1023            // test for offset = total_len - slice_len and slice length = slice_len
1024            // slice the input array using which the run array was built.
1025            let sliced_input_array = &input_array[total_len - slice_len..total_len];
1026
1027            // slice the run array
1028            let sliced_run_array: RunArray<Int16Type> = run_array
1029                .slice(total_len - slice_len, slice_len)
1030                .into_data()
1031                .into();
1032
1033            // Get physical indices
1034            let physical_indices = sliced_run_array
1035                .get_physical_indices(&logical_indices)
1036                .unwrap();
1037
1038            compare_logical_and_physical_indices(
1039                &logical_indices,
1040                sliced_input_array,
1041                &physical_indices,
1042                physical_values_array,
1043            );
1044        }
1045    }
1046
1047    #[test]
1048    fn test_logical_nulls() {
1049        let run = Int32Array::from(vec![3, 6, 9, 12]);
1050        let values = Int32Array::from(vec![Some(0), None, Some(1), None]);
1051        let array = RunArray::try_new(&run, &values).unwrap();
1052
1053        let expected = [
1054            true, true, true, false, false, false, true, true, true, false, false, false,
1055        ];
1056
1057        let n = array.logical_nulls().unwrap();
1058        assert_eq!(n.null_count(), 6);
1059
1060        let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)];
1061        for (offset, length) in slices {
1062            let a = array.slice(offset, length);
1063            let n = a.logical_nulls().unwrap();
1064            let n = n.into_iter().collect::<Vec<_>>();
1065            assert_eq!(&n, &expected[offset..offset + length], "{offset} {length}");
1066        }
1067    }
1068
1069    #[test]
1070    fn test_run_array_eq_identical() {
1071        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1072        let values1 = StringArray::from(vec!["a", "b", "c"]);
1073        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1074
1075        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1076        let values2 = StringArray::from(vec!["a", "b", "c"]);
1077        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1078
1079        assert_eq!(array1, array2);
1080    }
1081
1082    #[test]
1083    fn test_run_array_ne_different_run_ends() {
1084        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1085        let values1 = StringArray::from(vec!["a", "b", "c"]);
1086        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1087
1088        let run_ends2 = Int32Array::from(vec![1, 4, 6]);
1089        let values2 = StringArray::from(vec!["a", "b", "c"]);
1090        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1091
1092        assert_ne!(array1, array2);
1093    }
1094
1095    #[test]
1096    fn test_run_array_ne_different_values() {
1097        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1098        let values1 = StringArray::from(vec!["a", "b", "c"]);
1099        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1100
1101        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1102        let values2 = StringArray::from(vec!["a", "b", "d"]);
1103        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1104
1105        assert_ne!(array1, array2);
1106    }
1107
1108    #[test]
1109    fn test_run_array_eq_with_nulls() {
1110        let run_ends1 = Int32Array::from(vec![2, 4, 6]);
1111        let values1 = StringArray::from(vec![Some("a"), None, Some("c")]);
1112        let array1 = RunArray::<Int32Type>::try_new(&run_ends1, &values1).unwrap();
1113
1114        let run_ends2 = Int32Array::from(vec![2, 4, 6]);
1115        let values2 = StringArray::from(vec![Some("a"), None, Some("c")]);
1116        let array2 = RunArray::<Int32Type>::try_new(&run_ends2, &values2).unwrap();
1117
1118        assert_eq!(array1, array2);
1119    }
1120
1121    #[test]
1122    fn test_run_array_eq_different_run_end_types() {
1123        let run_ends_i16_1 = Int16Array::from(vec![2_i16, 4, 6]);
1124        let values_i16_1 = StringArray::from(vec!["a", "b", "c"]);
1125        let array_i16_1 = RunArray::<Int16Type>::try_new(&run_ends_i16_1, &values_i16_1).unwrap();
1126
1127        let run_ends_i16_2 = Int16Array::from(vec![2_i16, 4, 6]);
1128        let values_i16_2 = StringArray::from(vec!["a", "b", "c"]);
1129        let array_i16_2 = RunArray::<Int16Type>::try_new(&run_ends_i16_2, &values_i16_2).unwrap();
1130
1131        assert_eq!(array_i16_1, array_i16_2);
1132    }
1133}