Skip to main content

arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow arrays.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
159/// A generic representation of Arrow array data which encapsulates common attributes
160/// and operations for Arrow array.
161///
162/// Specific operations for different arrays types (e.g., primitive, list, struct)
163/// are implemented in `Array`.
164///
165/// # Memory Layout
166///
167/// `ArrayData` has references to one or more underlying data buffers
168/// and optional child ArrayData, depending on type as illustrated
169/// below. Bitmaps are not shown for simplicity but they are stored
170/// similarly to the buffers.
171///
172/// ```text
173///                        offset
174///                       points to
175/// ┌───────────────────┐ start of  ┌───────┐       Different
176/// │                   │   data    │       │     ArrayData may
177/// │ArrayData {        │           │....   │     also refers to
178/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
179/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
180/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
181/// │  buffers: [       │           │5882   │◀─
182/// │    ...            │  │        │4323   │
183/// │  ]                │   ─ ─ ─ ─▶│4859   │
184/// │  child_data: [    │           │....   │
185/// │    ...            │           │       │
186/// │  ]                │           └───────┘
187/// │}                  │
188/// │                   │            Shared Buffer uses
189/// │               │   │            bytes::Bytes to hold
190/// └───────────────────┘            actual data values
191///           ┌ ─ ─ ┘
192///
193///           ▼
194/// ┌───────────────────┐
195/// │ArrayData {        │
196/// │  ...              │
197/// │}                  │
198/// │                   │
199/// └───────────────────┘
200///
201/// Child ArrayData may also have its own buffers and children
202/// ```
203
204#[derive(Debug, Clone)]
205pub struct ArrayData {
206    /// The data type
207    data_type: DataType,
208
209    /// The number of elements
210    len: usize,
211
212    /// The offset in number of items (not bytes).
213    ///
214    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
215    /// does NOT apply to [`Self::nulls`].
216    offset: usize,
217
218    /// The buffers that store the actual data for this array, as defined
219    /// in the [Arrow Spec].
220    ///
221    /// Depending on the array types, [`Self::buffers`] can hold different
222    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
223    /// positions.
224    ///
225    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
226    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
227    /// we need to satisfy `len`.
228    ///
229    /// [Arrow Spec](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout)
230    buffers: Vec<Buffer>,
231
232    /// The child(ren) of this array.
233    ///
234    /// Only non-empty for nested types, such as `ListArray` and
235    /// `StructArray`.
236    ///
237    /// The first logical element in each child element begins at `offset`.
238    ///
239    /// If the child element also has an offset then these offsets are
240    /// cumulative.
241    child_data: Vec<ArrayData>,
242
243    /// The null bitmap.
244    ///
245    /// `None` indicates all values are non-null in this array.
246    ///
247    /// [`Self::offset]` does not apply to the null bitmap. While the
248    /// BooleanBuffer may be sliced (have its own offset) internally, this
249    /// `NullBuffer` always represents exactly `len` elements.
250    nulls: Option<NullBuffer>,
251}
252
253/// A thread-safe, shared reference to the Arrow array data.
254pub type ArrayDataRef = Arc<ArrayData>;
255
256fn checked_len_plus_offset(
257    data_type: &DataType,
258    len: usize,
259    offset: usize,
260) -> Result<usize, ArrowError> {
261    len.checked_add(offset).ok_or_else(|| {
262        ArrowError::InvalidArgumentError(format!(
263            "Length {len} with offset {offset} overflows usize for {data_type}"
264        ))
265    })
266}
267
268impl ArrayData {
269    /// Create a new ArrayData instance;
270    ///
271    /// If `null_count` is not specified, the number of nulls in
272    /// null_bit_buffer is calculated.
273    ///
274    /// If the number of nulls is 0 then the null_bit_buffer
275    /// is set to `None`.
276    ///
277    /// # Safety
278    ///
279    /// The input values *must* form a valid Arrow array for
280    /// `data_type`, or undefined behavior can result.
281    ///
282    /// Note: This is a low level API and most users of the arrow
283    /// crate should create arrays using the methods in the `array`
284    /// module.
285    pub unsafe fn new_unchecked(
286        data_type: DataType,
287        len: usize,
288        null_count: Option<usize>,
289        null_bit_buffer: Option<Buffer>,
290        offset: usize,
291        buffers: Vec<Buffer>,
292        child_data: Vec<ArrayData>,
293    ) -> Self {
294        let mut skip_validation = UnsafeFlag::new();
295        // SAFETY: caller responsible for ensuring data is valid
296        unsafe { skip_validation.set(true) };
297
298        ArrayDataBuilder {
299            data_type,
300            len,
301            null_count,
302            null_bit_buffer,
303            nulls: None,
304            offset,
305            buffers,
306            child_data,
307            align_buffers: false,
308            skip_validation,
309        }
310        .build()
311        .unwrap()
312    }
313
314    /// Create a new ArrayData, validating that the provided buffers form a valid
315    /// Arrow array of the specified data type.
316    ///
317    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
318    /// is set to `None`.
319    ///
320    /// Internally this calls through to [`Self::validate_data`]
321    ///
322    /// Note: This is a low level API and most users of the arrow crate should create
323    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
324    pub fn try_new(
325        data_type: DataType,
326        len: usize,
327        null_bit_buffer: Option<Buffer>,
328        offset: usize,
329        buffers: Vec<Buffer>,
330        child_data: Vec<ArrayData>,
331    ) -> Result<Self, ArrowError> {
332        // we must check the length of `null_bit_buffer` first
333        // because we use this buffer to calculate `null_count`
334        // in `Self::new_unchecked`.
335        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
336            let len_plus_offset = checked_len_plus_offset(&data_type, len, offset)?;
337            let needed_len = bit_util::ceil(len_plus_offset, 8);
338            if null_bit_buffer.len() < needed_len {
339                return Err(ArrowError::InvalidArgumentError(format!(
340                    "null_bit_buffer size too small. got {} needed {}",
341                    null_bit_buffer.len(),
342                    needed_len
343                )));
344            }
345        }
346        // Safety justification: `validate_full` is called below
347        let new_self = unsafe {
348            Self::new_unchecked(
349                data_type,
350                len,
351                None,
352                null_bit_buffer,
353                offset,
354                buffers,
355                child_data,
356            )
357        };
358
359        // As the data is not trusted, do a full validation of its contents
360        // We don't need to validate children as we can assume that the
361        // [`ArrayData`] in `child_data` have already been validated through
362        // a call to `ArrayData::try_new` or created using unsafe
363        new_self.validate_data()?;
364        Ok(new_self)
365    }
366
367    /// Returns a builder to construct a [`ArrayData`] instance of the same [`DataType`]
368    #[inline]
369    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
370        ArrayDataBuilder::new(data_type)
371    }
372
373    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
374    #[inline]
375    pub const fn data_type(&self) -> &DataType {
376        &self.data_type
377    }
378
379    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
380    pub fn buffers(&self) -> &[Buffer] {
381        &self.buffers
382    }
383
384    /// Returns a slice of children [`ArrayData`]. This will be non
385    /// empty for type such as lists and structs.
386    pub fn child_data(&self) -> &[ArrayData] {
387        &self.child_data[..]
388    }
389
390    /// Returns whether the element at index `i` is null
391    #[inline]
392    pub fn is_null(&self, i: usize) -> bool {
393        match &self.nulls {
394            Some(v) => v.is_null(i),
395            None => false,
396        }
397    }
398
399    /// Returns a reference to the null buffer of this [`ArrayData`] if any
400    ///
401    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
402    #[inline]
403    pub fn nulls(&self) -> Option<&NullBuffer> {
404        self.nulls.as_ref()
405    }
406
407    /// Returns whether the element at index `i` is not null
408    #[inline]
409    pub fn is_valid(&self, i: usize) -> bool {
410        !self.is_null(i)
411    }
412
413    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
414    #[inline]
415    pub const fn len(&self) -> usize {
416        self.len
417    }
418
419    /// Returns whether this [`ArrayData`] is empty
420    #[inline]
421    pub const fn is_empty(&self) -> bool {
422        self.len == 0
423    }
424
425    /// Returns the offset of this [`ArrayData`]
426    #[inline]
427    pub const fn offset(&self) -> usize {
428        self.offset
429    }
430
431    /// Returns the total number of nulls in this array
432    #[inline]
433    pub fn null_count(&self) -> usize {
434        self.nulls
435            .as_ref()
436            .map(|x| x.null_count())
437            .unwrap_or_default()
438    }
439
440    /// Returns the total number of bytes of memory occupied by the
441    /// buffers owned by this [`ArrayData`] and all of its
442    /// children. (See also diagram on [`ArrayData`]).
443    ///
444    /// Note that this [`ArrayData`] may only refer to a subset of the
445    /// data in the underlying [`Buffer`]s (due to `offset` and
446    /// `length`), but the size returned includes the entire size of
447    /// the buffers.
448    ///
449    /// If multiple [`ArrayData`]s refer to the same underlying
450    /// [`Buffer`]s they will both report the same size.
451    pub fn get_buffer_memory_size(&self) -> usize {
452        let mut size = 0;
453        for buffer in &self.buffers {
454            size += buffer.capacity();
455        }
456        if let Some(bitmap) = &self.nulls {
457            size += bitmap.buffer().capacity()
458        }
459        for child in &self.child_data {
460            size += child.get_buffer_memory_size();
461        }
462        size
463    }
464
465    /// Returns the total number of the bytes of memory occupied by
466    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
467    ///
468    /// This is approximately the number of bytes if a new
469    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
470    /// exactly the data needed.
471    ///
472    /// For example, a [`DataType::Int64`] with `100` elements,
473    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
474    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
475    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
476    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
477    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
478        let mut result: usize = 0;
479        let layout = layout(&self.data_type);
480
481        for spec in layout.buffers.iter() {
482            match spec {
483                BufferSpec::FixedWidth { byte_width, .. } => {
484                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
485                        ArrowError::ComputeError(
486                            "Integer overflow computing buffer size".to_string(),
487                        )
488                    })?;
489                    result += buffer_size;
490                }
491                BufferSpec::VariableWidth => {
492                    let buffer_len = match self.data_type {
493                        DataType::Utf8 | DataType::Binary => {
494                            let offsets = self.typed_offsets::<i32>()?;
495                            (offsets[self.len] - offsets[0]) as usize
496                        }
497                        DataType::LargeUtf8 | DataType::LargeBinary => {
498                            let offsets = self.typed_offsets::<i64>()?;
499                            (offsets[self.len] - offsets[0]) as usize
500                        }
501                        _ => {
502                            return Err(ArrowError::NotYetImplemented(format!(
503                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
504                                self.data_type
505                            )));
506                        }
507                    };
508                    result += buffer_len;
509                }
510                BufferSpec::BitMap => {
511                    let buffer_size = bit_util::ceil(self.len, 8);
512                    result += buffer_size;
513                }
514                BufferSpec::AlwaysNull => {
515                    // Nothing to do
516                }
517            }
518        }
519
520        if self.nulls().is_some() {
521            result += bit_util::ceil(self.len, 8);
522        }
523
524        for child in &self.child_data {
525            result += child.get_slice_memory_size()?;
526        }
527        Ok(result)
528    }
529
530    /// Returns the total number of bytes of memory occupied
531    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
532    /// children. (See also diagram on [`ArrayData`]).
533    ///
534    /// Equivalent to:
535    ///  `size_of_val(self)` +
536    ///  [`Self::get_buffer_memory_size`] +
537    ///  `size_of_val(child)` for all children
538    pub fn get_array_memory_size(&self) -> usize {
539        let mut size = mem::size_of_val(self);
540
541        // Calculate rest of the fields top down which contain actual data
542        for buffer in &self.buffers {
543            size += mem::size_of::<Buffer>();
544            size += buffer.capacity();
545        }
546        if let Some(nulls) = &self.nulls {
547            size += nulls.buffer().capacity();
548        }
549        for child in &self.child_data {
550            size += child.get_array_memory_size();
551        }
552
553        size
554    }
555
556    /// Creates a zero-copy slice of itself. This creates a new
557    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
558    /// different offset and len
559    ///
560    /// # Panics
561    ///
562    /// Panics if `offset + length` overflows or is greater than `self.len()`.
563    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
564        let end = offset
565            .checked_add(length)
566            .expect("offset + length overflow");
567        assert!(end <= self.len());
568
569        if let DataType::Struct(_) = self.data_type() {
570            // Slice into children
571            let new_offset = self.offset + offset;
572            ArrayData {
573                data_type: self.data_type().clone(),
574                len: length,
575                offset: new_offset,
576                buffers: self.buffers.clone(),
577                // Slice child data, to propagate offsets down to them
578                child_data: self
579                    .child_data()
580                    .iter()
581                    .map(|data| data.slice(offset, length))
582                    .collect(),
583                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
584            }
585        } else {
586            let mut new_data = self.clone();
587
588            new_data.len = length;
589            new_data.offset = offset + self.offset;
590            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
591
592            new_data
593        }
594    }
595
596    /// Returns the `buffer` as a slice of type `T` starting at self.offset
597    ///
598    /// # Panics
599    /// This function panics if:
600    /// * the buffer is not byte-aligned with type T, or
601    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
602    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
603        &self.buffers()[buffer].typed_data()[self.offset..]
604    }
605
606    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
607    pub fn new_null(data_type: &DataType, len: usize) -> Self {
608        let bit_len = bit_util::ceil(len, 8);
609        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
610
611        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
612            Some(width) => (vec![zeroed(width * len)], vec![], true),
613            None => match data_type {
614                DataType::Null => (vec![], vec![], false),
615                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
616                DataType::Binary | DataType::Utf8 => {
617                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
618                }
619                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
620                DataType::LargeBinary | DataType::LargeUtf8 => {
621                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
622                }
623                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
624                DataType::List(f) | DataType::Map(f, _) => (
625                    vec![zeroed((len + 1) * 4)],
626                    vec![ArrayData::new_empty(f.data_type())],
627                    true,
628                ),
629                DataType::LargeList(f) => (
630                    vec![zeroed((len + 1) * 8)],
631                    vec![ArrayData::new_empty(f.data_type())],
632                    true,
633                ),
634                DataType::ListView(f) => (
635                    vec![zeroed(len * 4), zeroed(len * 4)],
636                    vec![ArrayData::new_empty(f.data_type())],
637                    true,
638                ),
639                DataType::LargeListView(f) => (
640                    vec![zeroed(len * 8), zeroed(len * 8)],
641                    vec![ArrayData::new_empty(f.data_type())],
642                    true,
643                ),
644                DataType::FixedSizeList(f, list_len) => (
645                    vec![],
646                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
647                    true,
648                ),
649                DataType::Struct(fields) => (
650                    vec![],
651                    fields
652                        .iter()
653                        .map(|f| Self::new_null(f.data_type(), len))
654                        .collect(),
655                    true,
656                ),
657                DataType::Dictionary(k, v) => (
658                    vec![zeroed(k.primitive_width().unwrap() * len)],
659                    vec![ArrayData::new_empty(v.as_ref())],
660                    true,
661                ),
662                DataType::Union(f, mode) => {
663                    let (id, _) = f.iter().next().unwrap();
664                    let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
665                    let buffers = match mode {
666                        UnionMode::Sparse => vec![ids],
667                        UnionMode::Dense => {
668                            let end_offset = i32::from_usize(len).unwrap();
669                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
670                        }
671                    };
672
673                    let children = f
674                        .iter()
675                        .enumerate()
676                        .map(|(idx, (_, f))| {
677                            if idx == 0 || *mode == UnionMode::Sparse {
678                                Self::new_null(f.data_type(), len)
679                            } else {
680                                Self::new_empty(f.data_type())
681                            }
682                        })
683                        .collect();
684
685                    (buffers, children, false)
686                }
687                DataType::RunEndEncoded(r, v) => {
688                    let runs = match r.data_type() {
689                        DataType::Int16 => {
690                            let i = i16::from_usize(len).expect("run overflow");
691                            Buffer::from_slice_ref([i])
692                        }
693                        DataType::Int32 => {
694                            let i = i32::from_usize(len).expect("run overflow");
695                            Buffer::from_slice_ref([i])
696                        }
697                        DataType::Int64 => {
698                            let i = i64::from_usize(len).expect("run overflow");
699                            Buffer::from_slice_ref([i])
700                        }
701                        dt => unreachable!("Invalid run ends data type {dt}"),
702                    };
703
704                    let builder = ArrayData::builder(r.data_type().clone())
705                        .len(1)
706                        .buffers(vec![runs]);
707
708                    // SAFETY:
709                    // Valid by construction
710                    let runs = unsafe { builder.build_unchecked() };
711                    (
712                        vec![],
713                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
714                        false,
715                    )
716                }
717                d => unreachable!("{d}"),
718            },
719        };
720
721        let mut builder = ArrayDataBuilder::new(data_type.clone())
722            .len(len)
723            .buffers(buffers)
724            .child_data(child_data);
725
726        if has_nulls {
727            builder = builder.nulls(Some(NullBuffer::new_null(len)))
728        }
729
730        // SAFETY:
731        // Data valid by construction
732        unsafe { builder.build_unchecked() }
733    }
734
735    /// Returns a new empty [ArrayData] valid for `data_type`.
736    pub fn new_empty(data_type: &DataType) -> Self {
737        Self::new_null(data_type, 0)
738    }
739
740    /// Verifies that the buffers meet the minimum alignment requirements for the data type
741    ///
742    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
743    ///
744    /// This can be useful for when interacting with data sent over IPC or FFI, that may
745    /// not meet the minimum alignment requirements
746    ///
747    /// This also aligns buffers of children data
748    pub fn align_buffers(&mut self) {
749        let layout = layout(&self.data_type);
750        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
751            if let BufferSpec::FixedWidth { alignment, .. } = spec {
752                if buffer.as_ptr().align_offset(*alignment) != 0 {
753                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
754                }
755            }
756        }
757        // align children data recursively
758        for data in self.child_data.iter_mut() {
759            data.align_buffers()
760        }
761    }
762
763    /// "cheap" validation of an `ArrayData`. Ensures buffers are
764    /// sufficiently sized to store `len` + `offset` total elements of
765    /// `data_type` and performs other inexpensive consistency checks.
766    ///
767    /// This check is "cheap" in the sense that it does not validate the
768    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
769    /// are within the bounds of the values buffer).
770    ///
771    /// See [ArrayData::validate_data] to validate fully the offset content
772    /// and the validity of utf8 data
773    pub fn validate(&self) -> Result<(), ArrowError> {
774        // Need at least this much space in each buffer
775        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
776
777        // Check that the data layout conforms to the spec
778        let layout = layout(&self.data_type);
779
780        if !layout.can_contain_null_mask && self.nulls.is_some() {
781            return Err(ArrowError::InvalidArgumentError(format!(
782                "Arrays of type {:?} cannot contain a null bitmask",
783                self.data_type,
784            )));
785        }
786
787        // Check data buffers length for view types and other types
788        if self.buffers.len() < layout.buffers.len()
789            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
790        {
791            return Err(ArrowError::InvalidArgumentError(format!(
792                "Expected {} buffers in array of type {:?}, got {}",
793                layout.buffers.len(),
794                self.data_type,
795                self.buffers.len(),
796            )));
797        }
798
799        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
800            match spec {
801                BufferSpec::FixedWidth {
802                    byte_width,
803                    alignment,
804                } => {
805                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
806
807                    if buffer.len() < min_buffer_size {
808                        return Err(ArrowError::InvalidArgumentError(format!(
809                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
810                            min_buffer_size,
811                            i,
812                            self.data_type,
813                            buffer.len()
814                        )));
815                    }
816
817                    let align_offset = buffer.as_ptr().align_offset(*alignment);
818                    if align_offset != 0 {
819                        return Err(ArrowError::InvalidArgumentError(format!(
820                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
821                            self.data_type,
822                            align_offset.min(alignment - align_offset)
823                        )));
824                    }
825                }
826                BufferSpec::VariableWidth => {
827                    // not cheap to validate (need to look at the
828                    // data). Partially checked in validate_offsets
829                    // called below. Can check with `validate_full`
830                }
831                BufferSpec::BitMap => {
832                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
833                    if buffer.len() < min_buffer_size {
834                        return Err(ArrowError::InvalidArgumentError(format!(
835                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
836                            min_buffer_size,
837                            i,
838                            self.data_type,
839                            buffer.len()
840                        )));
841                    }
842                }
843                BufferSpec::AlwaysNull => {
844                    // Nothing to validate
845                }
846            }
847        }
848
849        // check null bit buffer size
850        if let Some(nulls) = self.nulls() {
851            if nulls.null_count() > self.len {
852                return Err(ArrowError::InvalidArgumentError(format!(
853                    "null_count {} for an array exceeds length of {} elements",
854                    nulls.null_count(),
855                    self.len
856                )));
857            }
858
859            let actual_len = nulls.validity().len();
860            let needed_len = bit_util::ceil(len_plus_offset, 8);
861            if actual_len < needed_len {
862                return Err(ArrowError::InvalidArgumentError(format!(
863                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
864                )));
865            }
866
867            if nulls.len() != self.len {
868                return Err(ArrowError::InvalidArgumentError(format!(
869                    "null buffer incorrect size. got {} expected {}",
870                    nulls.len(),
871                    self.len
872                )));
873            }
874        }
875
876        self.validate_child_data()?;
877
878        // Additional Type specific checks
879        match &self.data_type {
880            DataType::Utf8 | DataType::Binary => {
881                self.validate_offsets::<i32>(self.buffers[1].len())?;
882            }
883            DataType::LargeUtf8 | DataType::LargeBinary => {
884                self.validate_offsets::<i64>(self.buffers[1].len())?;
885            }
886            DataType::Dictionary(key_type, _value_type) => {
887                // At the moment, constructing a DictionaryArray will also check this
888                if !DataType::is_dictionary_key_type(key_type) {
889                    return Err(ArrowError::InvalidArgumentError(format!(
890                        "Dictionary key type must be integer, but was {key_type}"
891                    )));
892                }
893            }
894            DataType::RunEndEncoded(run_ends_type, _) => {
895                if run_ends_type.is_nullable() {
896                    return Err(ArrowError::InvalidArgumentError(
897                        "The nullable should be set to false for the field defining run_ends array.".to_string()
898                    ));
899                }
900                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
901                    return Err(ArrowError::InvalidArgumentError(format!(
902                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
903                        run_ends_type.data_type()
904                    )));
905                }
906            }
907            _ => {}
908        };
909
910        Ok(())
911    }
912
913    /// Returns a reference to the data in `buffer` as a typed slice
914    /// (typically `&[i32]` or `&[i64]`) after validating. The
915    /// returned slice is guaranteed to have at least `self.len + 1`
916    /// entries.
917    ///
918    /// For an empty array, the `buffer` can also be empty.
919    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
920        // An empty list-like array can have 0 offsets
921        if self.len == 0 && self.buffers[0].is_empty() {
922            return Ok(&[]);
923        }
924
925        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
926
927        self.typed_buffer(0, len)
928    }
929
930    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
931    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
932        &self,
933        idx: usize,
934        len: usize,
935    ) -> Result<&[T], ArrowError> {
936        let buffer = &self.buffers[idx];
937
938        let required_elements = checked_len_plus_offset(&self.data_type, len, self.offset)?;
939        let byte_width = mem::size_of::<T>();
940        let required_len = required_elements.checked_mul(byte_width).ok_or_else(|| {
941            ArrowError::InvalidArgumentError(format!(
942                "Buffer {idx} of {} byte length overflow: {} elements of {} bytes exceeds usize",
943                self.data_type, required_elements, byte_width
944            ))
945        })?;
946
947        if buffer.len() < required_len {
948            return Err(ArrowError::InvalidArgumentError(format!(
949                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
950                idx,
951                self.data_type,
952                required_len,
953                buffer.len()
954            )));
955        }
956
957        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
958    }
959
960    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
961    /// offsets (of type T) into some other buffer of `values_length` bytes long
962    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
963        &self,
964        values_length: usize,
965    ) -> Result<(), ArrowError> {
966        // Justification: buffer size was validated above
967        let offsets = self.typed_offsets::<T>()?;
968        if offsets.is_empty() {
969            return Ok(());
970        }
971
972        let first_offset = offsets[0].to_usize().ok_or_else(|| {
973            ArrowError::InvalidArgumentError(format!(
974                "Error converting offset[0] ({}) to usize for {}",
975                offsets[0], self.data_type
976            ))
977        })?;
978
979        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
980            ArrowError::InvalidArgumentError(format!(
981                "Error converting offset[{}] ({}) to usize for {}",
982                self.len, offsets[self.len], self.data_type
983            ))
984        })?;
985
986        if first_offset > values_length {
987            return Err(ArrowError::InvalidArgumentError(format!(
988                "First offset {} of {} is larger than values length {}",
989                first_offset, self.data_type, values_length,
990            )));
991        }
992
993        if last_offset > values_length {
994            return Err(ArrowError::InvalidArgumentError(format!(
995                "Last offset {} of {} is larger than values length {}",
996                last_offset, self.data_type, values_length,
997            )));
998        }
999
1000        if first_offset > last_offset {
1001            return Err(ArrowError::InvalidArgumentError(format!(
1002                "First offset {} in {} is smaller than last offset {}",
1003                first_offset, self.data_type, last_offset,
1004            )));
1005        }
1006
1007        Ok(())
1008    }
1009
1010    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1011    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1012    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1013        &self,
1014        values_length: usize,
1015    ) -> Result<(), ArrowError> {
1016        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1017        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1018        if offsets.len() != sizes.len() {
1019            return Err(ArrowError::ComputeError(format!(
1020                "ListView offsets len {} does not match sizes len {}",
1021                offsets.len(),
1022                sizes.len()
1023            )));
1024        }
1025
1026        for i in 0..sizes.len() {
1027            let size = sizes[i].to_usize().ok_or_else(|| {
1028                ArrowError::InvalidArgumentError(format!(
1029                    "Error converting size[{}] ({}) to usize for {}",
1030                    i, sizes[i], self.data_type
1031                ))
1032            })?;
1033            let offset = offsets[i].to_usize().ok_or_else(|| {
1034                ArrowError::InvalidArgumentError(format!(
1035                    "Error converting offset[{}] ({}) to usize for {}",
1036                    i, offsets[i], self.data_type
1037                ))
1038            })?;
1039            if size
1040                .checked_add(offset)
1041                .expect("Offset and size have exceeded the usize boundary")
1042                > values_length
1043            {
1044                return Err(ArrowError::InvalidArgumentError(format!(
1045                    "Size {} at index {} is larger than the remaining values for {}",
1046                    size, i, self.data_type
1047                )));
1048            }
1049        }
1050        Ok(())
1051    }
1052
1053    /// Validates the layout of `child_data` ArrayData structures
1054    fn validate_child_data(&self) -> Result<(), ArrowError> {
1055        match &self.data_type {
1056            DataType::List(field) | DataType::Map(field, _) => {
1057                let values_data = self.get_single_valid_child_data(field.data_type())?;
1058                self.validate_offsets::<i32>(values_data.len)?;
1059                Ok(())
1060            }
1061            DataType::LargeList(field) => {
1062                let values_data = self.get_single_valid_child_data(field.data_type())?;
1063                self.validate_offsets::<i64>(values_data.len)?;
1064                Ok(())
1065            }
1066            DataType::ListView(field) => {
1067                let values_data = self.get_single_valid_child_data(field.data_type())?;
1068                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1069                Ok(())
1070            }
1071            DataType::LargeListView(field) => {
1072                let values_data = self.get_single_valid_child_data(field.data_type())?;
1073                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1074                Ok(())
1075            }
1076            DataType::FixedSizeList(field, list_size) => {
1077                let values_data = self.get_single_valid_child_data(field.data_type())?;
1078
1079                let list_size: usize = (*list_size).try_into().map_err(|_| {
1080                    ArrowError::InvalidArgumentError(format!(
1081                        "{} has a negative list_size {}",
1082                        self.data_type, list_size
1083                    ))
1084                })?;
1085
1086                let expected_values_len = self.len
1087                    .checked_mul(list_size)
1088                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1089
1090                if values_data.len < expected_values_len {
1091                    return Err(ArrowError::InvalidArgumentError(format!(
1092                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1093                        values_data.len, self.len, list_size, self.data_type
1094                    )));
1095                }
1096
1097                Ok(())
1098            }
1099            DataType::Struct(fields) => {
1100                self.validate_num_child_data(fields.len())?;
1101                for (i, field) in fields.iter().enumerate() {
1102                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1103
1104                    // Ensure child field has sufficient size
1105                    if field_data.len < self.len {
1106                        return Err(ArrowError::InvalidArgumentError(format!(
1107                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1108                            self.data_type,
1109                            i,
1110                            field.name(),
1111                            field_data.len,
1112                            self.len
1113                        )));
1114                    }
1115                }
1116                Ok(())
1117            }
1118            DataType::RunEndEncoded(run_ends_field, values_field) => {
1119                self.validate_num_child_data(2)?;
1120                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1121                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1122                if run_ends_data.len != values_data.len {
1123                    return Err(ArrowError::InvalidArgumentError(format!(
1124                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1125                        run_ends_data.len, values_data.len
1126                    )));
1127                }
1128                if run_ends_data.nulls.is_some() {
1129                    return Err(ArrowError::InvalidArgumentError(
1130                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1131                    ));
1132                }
1133                Ok(())
1134            }
1135            DataType::Union(fields, mode) => {
1136                self.validate_num_child_data(fields.len())?;
1137
1138                for (i, (_, field)) in fields.iter().enumerate() {
1139                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1140
1141                    if mode == &UnionMode::Sparse {
1142                        let len_plus_offset =
1143                            checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1144                        if field_data.len < len_plus_offset {
1145                            return Err(ArrowError::InvalidArgumentError(format!(
1146                                "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1147                                i, field_data.len, len_plus_offset
1148                            )));
1149                        }
1150                    }
1151                }
1152                Ok(())
1153            }
1154            DataType::Dictionary(_key_type, value_type) => {
1155                self.get_single_valid_child_data(value_type)?;
1156                Ok(())
1157            }
1158            _ => {
1159                // other types do not have child data
1160                if !self.child_data.is_empty() {
1161                    return Err(ArrowError::InvalidArgumentError(format!(
1162                        "Expected no child arrays for type {} but got {}",
1163                        self.data_type,
1164                        self.child_data.len()
1165                    )));
1166                }
1167                Ok(())
1168            }
1169        }
1170    }
1171
1172    /// Ensures that this array data has a single child_data with the
1173    /// expected type, and calls `validate()` on it. Returns a
1174    /// reference to that child_data
1175    fn get_single_valid_child_data(
1176        &self,
1177        expected_type: &DataType,
1178    ) -> Result<&ArrayData, ArrowError> {
1179        self.validate_num_child_data(1)?;
1180        self.get_valid_child_data(0, expected_type)
1181    }
1182
1183    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1184    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1185        if self.child_data.len() != expected_len {
1186            Err(ArrowError::InvalidArgumentError(format!(
1187                "Value data for {} should contain {} child data array(s), had {}",
1188                self.data_type,
1189                expected_len,
1190                self.child_data.len()
1191            )))
1192        } else {
1193            Ok(())
1194        }
1195    }
1196
1197    /// Ensures that `child_data[i]` has the expected type, calls
1198    /// `validate()` on it, and returns a reference to that child_data
1199    fn get_valid_child_data(
1200        &self,
1201        i: usize,
1202        expected_type: &DataType,
1203    ) -> Result<&ArrayData, ArrowError> {
1204        let values_data = self.child_data.get(i).ok_or_else(|| {
1205            ArrowError::InvalidArgumentError(format!(
1206                "{} did not have enough child arrays. Expected at least {} but had only {}",
1207                self.data_type,
1208                i + 1,
1209                self.child_data.len()
1210            ))
1211        })?;
1212
1213        if expected_type != &values_data.data_type {
1214            return Err(ArrowError::InvalidArgumentError(format!(
1215                "Child type mismatch for {}. Expected {} but child data had {}",
1216                self.data_type, expected_type, values_data.data_type
1217            )));
1218        }
1219
1220        values_data.validate()?;
1221        Ok(values_data)
1222    }
1223
1224    /// Validate that the data contained within this [`ArrayData`] is valid
1225    ///
1226    /// 1. Null count is correct
1227    /// 2. All offsets are valid
1228    /// 3. All String data is valid UTF-8
1229    /// 4. All dictionary offsets are valid
1230    ///
1231    /// Internally this calls:
1232    ///
1233    /// * [`Self::validate`]
1234    /// * [`Self::validate_nulls`]
1235    /// * [`Self::validate_values`]
1236    ///
1237    /// Note: this does not recurse into children, for a recursive variant
1238    /// see [`Self::validate_full`]
1239    pub fn validate_data(&self) -> Result<(), ArrowError> {
1240        self.validate()?;
1241
1242        self.validate_nulls()?;
1243        self.validate_values()?;
1244        Ok(())
1245    }
1246
1247    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1248    ///
1249    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1250    /// and all its children recursively
1251    pub fn validate_full(&self) -> Result<(), ArrowError> {
1252        self.validate_data()?;
1253        // validate all children recursively
1254        self.child_data
1255            .iter()
1256            .enumerate()
1257            .try_for_each(|(i, child_data)| {
1258                child_data.validate_full().map_err(|e| {
1259                    ArrowError::InvalidArgumentError(format!(
1260                        "{} child #{} invalid: {}",
1261                        self.data_type, i, e
1262                    ))
1263                })
1264            })?;
1265        Ok(())
1266    }
1267
1268    /// Validates the values stored within this [`ArrayData`] are valid
1269    /// without recursing into child [`ArrayData`]
1270    ///
1271    /// Does not (yet) check
1272    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1273    /// 2. the the null count is correct and that any
1274    /// 3. nullability requirements of its children are correct
1275    ///
1276    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1277    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1278        if let Some(nulls) = &self.nulls {
1279            let actual = nulls.len() - nulls.inner().count_set_bits();
1280            if actual != nulls.null_count() {
1281                return Err(ArrowError::InvalidArgumentError(format!(
1282                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1283                    nulls.null_count(),
1284                    actual
1285                )));
1286            }
1287        }
1288
1289        // In general non-nullable children should not contain nulls, however, for certain
1290        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1291        // space in the child. As such we permit nulls in the children in the corresponding
1292        // positions for such types
1293        match &self.data_type {
1294            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1295                if !f.is_nullable() {
1296                    self.validate_non_nullable(None, &self.child_data[0])?
1297                }
1298            }
1299            DataType::FixedSizeList(field, len) => {
1300                let child = &self.child_data[0];
1301                if !field.is_nullable() {
1302                    match &self.nulls {
1303                        Some(nulls) => {
1304                            let element_len = *len as usize;
1305                            let expanded = nulls.expand(element_len);
1306                            self.validate_non_nullable(Some(&expanded), child)?;
1307                        }
1308                        None => self.validate_non_nullable(None, child)?,
1309                    }
1310                }
1311            }
1312            DataType::Struct(fields) => {
1313                for (field, child) in fields.iter().zip(&self.child_data) {
1314                    if !field.is_nullable() {
1315                        self.validate_non_nullable(self.nulls(), child)?
1316                    }
1317                }
1318            }
1319            _ => {}
1320        }
1321
1322        Ok(())
1323    }
1324
1325    /// Verifies that `child` contains no nulls not present in `mask`
1326    fn validate_non_nullable(
1327        &self,
1328        mask: Option<&NullBuffer>,
1329        child: &ArrayData,
1330    ) -> Result<(), ArrowError> {
1331        let mask = match mask {
1332            Some(mask) => mask,
1333            None => {
1334                return match child.null_count() {
1335                    0 => Ok(()),
1336                    _ => Err(ArrowError::InvalidArgumentError(format!(
1337                        "non-nullable child of type {} contains nulls not present in parent {}",
1338                        child.data_type, self.data_type
1339                    ))),
1340                };
1341            }
1342        };
1343
1344        match child.nulls() {
1345            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1346                "non-nullable child of type {} contains nulls not present in parent",
1347                child.data_type
1348            ))),
1349            _ => Ok(()),
1350        }
1351    }
1352
1353    /// Validates the values stored within this [`ArrayData`] are valid
1354    /// without recursing into child [`ArrayData`]
1355    ///
1356    /// Does not (yet) check
1357    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1358    pub fn validate_values(&self) -> Result<(), ArrowError> {
1359        match &self.data_type {
1360            DataType::Utf8 => self.validate_utf8::<i32>(),
1361            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1362            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1363            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1364            DataType::BinaryView => {
1365                let views = self.typed_buffer::<u128>(0, self.len)?;
1366                validate_binary_view(views, &self.buffers[1..])
1367            }
1368            DataType::Utf8View => {
1369                let views = self.typed_buffer::<u128>(0, self.len)?;
1370                validate_string_view(views, &self.buffers[1..])
1371            }
1372            DataType::List(_) | DataType::Map(_, _) => {
1373                let child = &self.child_data[0];
1374                self.validate_offsets_full::<i32>(child.len)
1375            }
1376            DataType::LargeList(_) => {
1377                let child = &self.child_data[0];
1378                self.validate_offsets_full::<i64>(child.len)
1379            }
1380            DataType::Union(_, _) => {
1381                // Validate Union Array as part of implementing new Union semantics
1382                // See comments in `ArrayData::validate()`
1383                // https://github.com/apache/arrow-rs/issues/85
1384                //
1385                // TODO file follow on ticket for full union validation
1386                Ok(())
1387            }
1388            DataType::Dictionary(key_type, _value_type) => {
1389                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1390                let max_value = dictionary_length - 1;
1391                match key_type.as_ref() {
1392                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1393                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1394                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1395                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1396                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1397                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1398                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1399                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1400                    _ => unreachable!(),
1401                }
1402            }
1403            DataType::RunEndEncoded(run_ends, _values) => {
1404                let run_ends_data = self.child_data()[0].clone();
1405                match run_ends.data_type() {
1406                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1407                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1408                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1409                    _ => unreachable!(),
1410                }
1411            }
1412            _ => {
1413                // No extra validation check required for other types
1414                Ok(())
1415            }
1416        }
1417    }
1418
1419    /// Calls the `validate(item_index, range)` function for each of
1420    /// the ranges specified in the arrow offsets buffer of type
1421    /// `T`. Also validates that each offset is smaller than
1422    /// `offset_limit`
1423    ///
1424    /// For an empty array, the offsets buffer can either be empty
1425    /// or contain a single `0`.
1426    ///
1427    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1428    /// function would call `validate([1,2])`, and `validate([2,4])`
1429    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1430    where
1431        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1432        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1433    {
1434        self.typed_offsets::<T>()?
1435            .iter()
1436            .enumerate()
1437            .map(|(i, x)| {
1438                // check if the offset can be converted to usize
1439                let r = x.to_usize().ok_or_else(|| {
1440                    ArrowError::InvalidArgumentError(format!(
1441                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1442                    );
1443                // check if the offset exceeds the limit
1444                match r {
1445                    Ok(n) if n <= offset_limit => Ok((i, n)),
1446                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1447                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1448                    ),
1449                    Err(e) => Err(e),
1450                }
1451            })
1452            .scan(0_usize, |start, end| {
1453                // check offsets are monotonically increasing
1454                match end {
1455                    Ok((i, end)) if *start <= end => {
1456                        let range = Some(Ok((i, *start..end)));
1457                        *start = end;
1458                        range
1459                    }
1460                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1461                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1462                        i - 1, start, end))
1463                    )),
1464                    Err(err) => Some(Err(err)),
1465                }
1466            })
1467            .skip(1) // the first element is meaningless
1468            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1469                let (item_index, range) = res?;
1470                validate(item_index-1, range)
1471            })
1472    }
1473
1474    /// Ensures that all strings formed by the offsets in `buffers[0]`
1475    /// into `buffers[1]` are valid utf8 sequences
1476    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1477    where
1478        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1479    {
1480        let values_buffer = &self.buffers[1].as_slice();
1481        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1482            // Validate Offsets are correct
1483            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1484                if !values_str.is_char_boundary(range.start)
1485                    || !values_str.is_char_boundary(range.end)
1486                {
1487                    return Err(ArrowError::InvalidArgumentError(format!(
1488                        "incomplete utf-8 byte sequence from index {string_index}"
1489                    )));
1490                }
1491                Ok(())
1492            })
1493        } else {
1494            // find specific offset that failed utf8 validation
1495            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1496                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1497                    ArrowError::InvalidArgumentError(format!(
1498                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1499                    ))
1500                })?;
1501                Ok(())
1502            })
1503        }
1504    }
1505
1506    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1507    /// between `0` and `offset_limit`
1508    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1509    where
1510        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1511    {
1512        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1513            // No validation applied to each value, but the iteration
1514            // itself applies bounds checking to each range
1515            Ok(())
1516        })
1517    }
1518
1519    /// Validates that each value in self.buffers (typed as T)
1520    /// is within the range [0, max_value], inclusive
1521    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1522    where
1523        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1524    {
1525        let required_len = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1526        let buffer = &self.buffers[0];
1527
1528        // This should have been checked as part of `validate()` prior
1529        // to calling `validate_full()` but double check to be sure
1530        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1531
1532        // Justification: buffer size was validated above
1533        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..required_len];
1534
1535        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1536            // Do not check the value is null (value can be arbitrary)
1537            if self.is_null(i) {
1538                return Ok(());
1539            }
1540            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1541                ArrowError::InvalidArgumentError(format!(
1542                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1543                ))
1544            })?;
1545
1546            if dict_index < 0 || dict_index > max_value {
1547                return Err(ArrowError::InvalidArgumentError(format!(
1548                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1549                )));
1550            }
1551            Ok(())
1552        })
1553    }
1554
1555    /// Validates that each value in run_ends array is positive and strictly increasing.
1556    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1557    where
1558        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1559    {
1560        let values = self.typed_buffer::<T>(0, self.len)?;
1561        let mut prev_value: i64 = 0_i64;
1562        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1563            let value: i64 = inp_value.try_into().map_err(|_| {
1564                ArrowError::InvalidArgumentError(format!(
1565                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1566                ))
1567            })?;
1568            if value <= 0_i64 {
1569                return Err(ArrowError::InvalidArgumentError(format!(
1570                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1571                )));
1572            }
1573            if ix > 0 && value <= prev_value {
1574                return Err(ArrowError::InvalidArgumentError(format!(
1575                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1576                )));
1577            }
1578
1579            prev_value = value;
1580            Ok(())
1581        })?;
1582
1583        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1584        if prev_value.as_usize() < len_plus_offset {
1585            return Err(ArrowError::InvalidArgumentError(format!(
1586                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1587                len_plus_offset
1588            )));
1589        }
1590        Ok(())
1591    }
1592
1593    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1594    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1595    /// return false when the arrays are logically equal
1596    pub fn ptr_eq(&self, other: &Self) -> bool {
1597        if self.offset != other.offset
1598            || self.len != other.len
1599            || self.data_type != other.data_type
1600            || self.buffers.len() != other.buffers.len()
1601            || self.child_data.len() != other.child_data.len()
1602        {
1603            return false;
1604        }
1605
1606        match (&self.nulls, &other.nulls) {
1607            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1608            (Some(_), None) | (None, Some(_)) => return false,
1609            _ => {}
1610        };
1611
1612        if !self
1613            .buffers
1614            .iter()
1615            .zip(other.buffers.iter())
1616            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1617        {
1618            return false;
1619        }
1620
1621        self.child_data
1622            .iter()
1623            .zip(other.child_data.iter())
1624            .all(|(a, b)| a.ptr_eq(b))
1625    }
1626
1627    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1628    pub fn into_builder(self) -> ArrayDataBuilder {
1629        self.into()
1630    }
1631}
1632
1633/// Return the expected [`DataTypeLayout`] Arrays of this data
1634/// type are expected to have
1635pub fn layout(data_type: &DataType) -> DataTypeLayout {
1636    // based on C/C++ implementation in
1637    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1638    use arrow_schema::IntervalUnit::*;
1639
1640    match data_type {
1641        DataType::Null => DataTypeLayout {
1642            buffers: vec![],
1643            can_contain_null_mask: false,
1644            variadic: false,
1645        },
1646        DataType::Boolean => DataTypeLayout {
1647            buffers: vec![BufferSpec::BitMap],
1648            can_contain_null_mask: true,
1649            variadic: false,
1650        },
1651        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1652        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1653        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1654        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1655        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1656        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1657        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1658        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1659        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1660        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1661        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1662        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1663        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1664        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1665        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1666        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1667        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1668        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1669        DataType::Interval(MonthDayNano) => {
1670            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1671        }
1672        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1673        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1674        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1675        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1676        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1677        DataType::FixedSizeBinary(size) => {
1678            let spec = BufferSpec::FixedWidth {
1679                byte_width: (*size).try_into().unwrap(),
1680                alignment: mem::align_of::<u8>(),
1681            };
1682            DataTypeLayout {
1683                buffers: vec![spec],
1684                can_contain_null_mask: true,
1685                variadic: false,
1686            }
1687        }
1688        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1689        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1690        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1691        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1692        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1693        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1694        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1695        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1696        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1697        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1698        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1699        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1700        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1701        DataType::Union(_, mode) => {
1702            let type_ids = BufferSpec::FixedWidth {
1703                byte_width: mem::size_of::<i8>(),
1704                alignment: mem::align_of::<i8>(),
1705            };
1706
1707            DataTypeLayout {
1708                buffers: match mode {
1709                    UnionMode::Sparse => {
1710                        vec![type_ids]
1711                    }
1712                    UnionMode::Dense => {
1713                        vec![
1714                            type_ids,
1715                            BufferSpec::FixedWidth {
1716                                byte_width: mem::size_of::<i32>(),
1717                                alignment: mem::align_of::<i32>(),
1718                            },
1719                        ]
1720                    }
1721                },
1722                can_contain_null_mask: false,
1723                variadic: false,
1724            }
1725        }
1726        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1727    }
1728}
1729
/// Describes the buffers an array of a given data type is made of
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// One layout specification per expected buffer, in buffer order
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, `buffers.len()` is only a lower bound on the
    /// number of buffers: additional buffers beyond it are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Builds the [`BufferSpec::FixedWidth`] for elements of type `T`,
    /// using the size and alignment Rust reports for `T`
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// and no Null Bitmap either (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array: a fixed width offsets buffer
    /// with elements of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one buffer of `u128` sized elements, plus any
    /// number of additional buffers (`variadic` is set)
    pub fn new_view() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<u128>()],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: two fixed width buffers whose elements
    /// both have type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>(), Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }
}

/// Describes the layout of one buffer of a data type
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// Size of a single element, in bytes
        byte_width: usize,
        /// Alignment Rust requires for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1861
1862impl PartialEq for ArrayData {
1863    fn eq(&self, other: &Self) -> bool {
1864        equal::equal(self, other)
1865    }
1866}
1867
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`].
    #[inline]
    pub const fn new() -> Self {
        Self(false)
    }

    /// Sets the value of the flag to the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// If set, the flag will be set to the given value. There is nothing
    /// immediately unsafe about doing so, however, the flag can be used to
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        self.0 = val;
    }

    /// Returns the value of the flag
    ///
    /// This is a `const fn`, like [`Self::new`], so the flag can also be
    /// read in constant contexts.
    #[inline]
    pub const fn get(&self) -> bool {
        self.0
    }
}

// Manual impl to make it clear you can not construct unsafe with true
impl Default for UnsafeFlag {
    /// Returns a flag set to `false`, exactly like [`UnsafeFlag::new`]
    fn default() -> Self {
        Self::new()
    }
}
1926
1927/// Builder for [`ArrayData`] type
1928#[derive(Debug)]
1929pub struct ArrayDataBuilder {
1930    data_type: DataType,
1931    len: usize,
1932    null_count: Option<usize>,
1933    null_bit_buffer: Option<Buffer>,
1934    nulls: Option<NullBuffer>,
1935    offset: usize,
1936    buffers: Vec<Buffer>,
1937    child_data: Vec<ArrayData>,
1938    /// Should buffers be realigned (copying if necessary)?
1939    ///
1940    /// Defaults to false.
1941    align_buffers: bool,
1942    /// Should data validation be skipped for this [`ArrayData`]?
1943    ///
1944    /// Defaults to false.
1945    ///
1946    /// # Safety
1947    ///
1948    /// This flag can only be set to true using `unsafe` APIs. However, once true
1949    /// subsequent calls to `build()` may result in undefined behavior if the data
1950    /// is not valid.
1951    skip_validation: UnsafeFlag,
1952}
1953
1954impl ArrayDataBuilder {
1955    #[inline]
1956    /// Creates a new array data builder
1957    pub const fn new(data_type: DataType) -> Self {
1958        Self {
1959            data_type,
1960            len: 0,
1961            null_count: None,
1962            null_bit_buffer: None,
1963            nulls: None,
1964            offset: 0,
1965            buffers: vec![],
1966            child_data: vec![],
1967            align_buffers: false,
1968            skip_validation: UnsafeFlag::new(),
1969        }
1970    }
1971
1972    /// Creates a new array data builder from an existing one, changing the data type
1973    pub fn data_type(self, data_type: DataType) -> Self {
1974        Self { data_type, ..self }
1975    }
1976
1977    #[inline]
1978    #[allow(clippy::len_without_is_empty)]
1979    /// Sets the length of the [ArrayData]
1980    pub const fn len(mut self, n: usize) -> Self {
1981        self.len = n;
1982        self
1983    }
1984
1985    /// Sets the null buffer of the [ArrayData]
1986    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1987        self.nulls = nulls;
1988        self.null_count = None;
1989        self.null_bit_buffer = None;
1990        self
1991    }
1992
1993    /// Sets the null count of the [ArrayData]
1994    pub fn null_count(mut self, null_count: usize) -> Self {
1995        self.null_count = Some(null_count);
1996        self
1997    }
1998
1999    /// Sets the `null_bit_buffer` of the [ArrayData]
2000    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2001        self.nulls = None;
2002        self.null_bit_buffer = buf;
2003        self
2004    }
2005
2006    /// Sets the offset of the [ArrayData]
2007    #[inline]
2008    pub const fn offset(mut self, n: usize) -> Self {
2009        self.offset = n;
2010        self
2011    }
2012
2013    /// Sets the buffers of the [ArrayData]
2014    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2015        self.buffers = v;
2016        self
2017    }
2018
2019    /// Adds a single buffer to the [ArrayData]'s buffers
2020    pub fn add_buffer(mut self, b: Buffer) -> Self {
2021        self.buffers.push(b);
2022        self
2023    }
2024
2025    /// Adds multiple buffers to the [ArrayData]'s buffers
2026    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2027        self.buffers.extend(bs);
2028        self
2029    }
2030
2031    /// Sets the child data of the [ArrayData]
2032    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2033        self.child_data = v;
2034        self
2035    }
2036
2037    /// Adds a single child data to the [ArrayData]'s child data
2038    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2039        self.child_data.push(r);
2040        self
2041    }
2042
2043    /// Creates an array data, without any validation
2044    ///
2045    /// Note: This is shorthand for
2046    /// ```rust
2047    /// # #[expect(unsafe_op_in_unsafe_fn)]
2048    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2049    /// # let _ = unsafe {
2050    /// builder.skip_validation(true).build().unwrap()
2051    /// # };
2052    /// ```
2053    ///
2054    /// # Safety
2055    ///
2056    /// The same caveats as [`ArrayData::new_unchecked`]
2057    /// apply.
2058    pub unsafe fn build_unchecked(self) -> ArrayData {
2059        unsafe { self.skip_validation(true) }.build().unwrap()
2060    }
2061
2062    /// Creates an `ArrayData`, consuming `self`
2063    ///
2064    /// # Safety
2065    ///
2066    /// By default the underlying buffers are checked to ensure they are valid
2067    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2068    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2069    /// not valid, undefined behavior will result.
2070    pub fn build(self) -> Result<ArrayData, ArrowError> {
2071        let Self {
2072            data_type,
2073            len,
2074            null_count,
2075            null_bit_buffer,
2076            nulls,
2077            offset,
2078            buffers,
2079            child_data,
2080            align_buffers,
2081            skip_validation,
2082        } = self;
2083
2084        let nulls = nulls
2085            .or_else(|| {
2086                let buffer = null_bit_buffer?;
2087                let buffer = BooleanBuffer::new(buffer, offset, len);
2088                Some(match null_count {
2089                    Some(n) => {
2090                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2091                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2092                    }
2093                    None => NullBuffer::new(buffer),
2094                })
2095            })
2096            .filter(|b| b.null_count() != 0);
2097
2098        let mut data = ArrayData {
2099            data_type,
2100            len,
2101            offset,
2102            buffers,
2103            child_data,
2104            nulls,
2105        };
2106
2107        if align_buffers {
2108            data.align_buffers();
2109        }
2110
2111        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2112        if !skip_validation.get() || cfg!(feature = "force_validate") {
2113            data.validate_data()?;
2114        }
2115        Ok(data)
2116    }
2117
2118    /// Creates an array data, validating all inputs, and aligning any buffers
2119    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2120    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2121        self.align_buffers(true).build()
2122    }
2123
2124    /// Ensure that all buffers are aligned, copying data if necessary
2125    ///
2126    /// Rust requires that arrays are aligned to their corresponding primitive,
2127    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2128    ///
2129    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2130    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2131    ///
2132    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2133    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2134    /// when necessary, making it useful when interacting with buffers produced by other systems,
2135    /// e.g. IPC or FFI.
2136    ///
2137    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2138    /// insufficiently aligned buffers.
2139    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2140        self.align_buffers = align_buffers;
2141        self
2142    }
2143
2144    /// Skips validation of the data.
2145    ///
2146    /// If this flag is enabled, `[Self::build`] will skip validation of the
2147    /// data
2148    ///
2149    /// If this flag is not enabled, `[Self::build`] will validate that all
2150    /// buffers are valid and will return an error if any data is invalid.
2151    /// Validation can be expensive.
2152    ///
2153    /// # Safety
2154    ///
2155    /// If validation is skipped, the buffers must form a valid Arrow array,
2156    /// otherwise undefined behavior will result
2157    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2158        unsafe {
2159            self.skip_validation.set(skip_validation);
2160        }
2161        self
2162    }
2163}
2164
2165impl From<ArrayData> for ArrayDataBuilder {
2166    fn from(d: ArrayData) -> Self {
2167        Self {
2168            data_type: d.data_type,
2169            len: d.len,
2170            offset: d.offset,
2171            buffers: d.buffers,
2172            child_data: d.child_data,
2173            nulls: d.nulls,
2174            null_bit_buffer: None,
2175            null_count: None,
2176            align_buffers: false,
2177            skip_validation: UnsafeFlag::new(),
2178        }
2179    }
2180}
2181
2182#[cfg(test)]
2183mod tests {
2184    use super::*;
2185    use arrow_schema::{Field, Fields};
2186
2187    // See arrow/tests/array_data_validation.rs for test of array validation
2188
2189    /// returns a buffer initialized with some constant value for tests
2190    fn make_i32_buffer(n: usize) -> Buffer {
2191        Buffer::from_slice_ref(vec![42i32; n])
2192    }
2193
2194    /// returns a buffer initialized with some constant value for tests
2195    fn make_f32_buffer(n: usize) -> Buffer {
2196        Buffer::from_slice_ref(vec![42f32; n])
2197    }
2198
2199    #[test]
2200    fn test_builder() {
2201        // Buffer needs to be at least 25 long
2202        let v = (0..25).collect::<Vec<i32>>();
2203        let b1 = Buffer::from_slice_ref(&v);
2204        let arr_data = ArrayData::builder(DataType::Int32)
2205            .len(20)
2206            .offset(5)
2207            .add_buffer(b1)
2208            .null_bit_buffer(Some(Buffer::from([
2209                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2210            ])))
2211            .build()
2212            .unwrap();
2213
2214        assert_eq!(20, arr_data.len());
2215        assert_eq!(10, arr_data.null_count());
2216        assert_eq!(5, arr_data.offset());
2217        assert_eq!(1, arr_data.buffers().len());
2218        assert_eq!(
2219            Buffer::from_slice_ref(&v).as_slice(),
2220            arr_data.buffers()[0].as_slice()
2221        );
2222    }
2223
2224    #[test]
2225    fn test_builder_with_child_data() {
2226        let child_arr_data = ArrayData::try_new(
2227            DataType::Int32,
2228            5,
2229            None,
2230            0,
2231            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2232            vec![],
2233        )
2234        .unwrap();
2235
2236        let field = Arc::new(Field::new("x", DataType::Int32, true));
2237        let data_type = DataType::Struct(vec![field].into());
2238
2239        let arr_data = ArrayData::builder(data_type)
2240            .len(5)
2241            .offset(0)
2242            .add_child_data(child_arr_data.clone())
2243            .build()
2244            .unwrap();
2245
2246        assert_eq!(5, arr_data.len());
2247        assert_eq!(1, arr_data.child_data().len());
2248        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2249    }
2250
2251    #[test]
2252    fn test_null_count() {
2253        let mut bit_v: [u8; 2] = [0; 2];
2254        bit_util::set_bit(&mut bit_v, 0);
2255        bit_util::set_bit(&mut bit_v, 3);
2256        bit_util::set_bit(&mut bit_v, 10);
2257        let arr_data = ArrayData::builder(DataType::Int32)
2258            .len(16)
2259            .add_buffer(make_i32_buffer(16))
2260            .null_bit_buffer(Some(Buffer::from(bit_v)))
2261            .build()
2262            .unwrap();
2263        assert_eq!(13, arr_data.null_count());
2264
2265        // Test with offset
2266        let mut bit_v: [u8; 2] = [0; 2];
2267        bit_util::set_bit(&mut bit_v, 0);
2268        bit_util::set_bit(&mut bit_v, 3);
2269        bit_util::set_bit(&mut bit_v, 10);
2270        let arr_data = ArrayData::builder(DataType::Int32)
2271            .len(12)
2272            .offset(2)
2273            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2274            .null_bit_buffer(Some(Buffer::from(bit_v)))
2275            .build()
2276            .unwrap();
2277        assert_eq!(10, arr_data.null_count());
2278    }
2279
2280    #[test]
2281    fn test_null_buffer_ref() {
2282        let mut bit_v: [u8; 2] = [0; 2];
2283        bit_util::set_bit(&mut bit_v, 0);
2284        bit_util::set_bit(&mut bit_v, 3);
2285        bit_util::set_bit(&mut bit_v, 10);
2286        let arr_data = ArrayData::builder(DataType::Int32)
2287            .len(16)
2288            .add_buffer(make_i32_buffer(16))
2289            .null_bit_buffer(Some(Buffer::from(bit_v)))
2290            .build()
2291            .unwrap();
2292        assert!(arr_data.nulls().is_some());
2293        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2294    }
2295
2296    #[test]
2297    fn test_slice() {
2298        let mut bit_v: [u8; 2] = [0; 2];
2299        bit_util::set_bit(&mut bit_v, 0);
2300        bit_util::set_bit(&mut bit_v, 3);
2301        bit_util::set_bit(&mut bit_v, 10);
2302        let data = ArrayData::builder(DataType::Int32)
2303            .len(16)
2304            .add_buffer(make_i32_buffer(16))
2305            .null_bit_buffer(Some(Buffer::from(bit_v)))
2306            .build()
2307            .unwrap();
2308        let new_data = data.slice(1, 15);
2309        assert_eq!(data.len() - 1, new_data.len());
2310        assert_eq!(1, new_data.offset());
2311        assert_eq!(data.null_count(), new_data.null_count());
2312
2313        // slice of a slice (removes one null)
2314        let new_data = new_data.slice(1, 14);
2315        assert_eq!(data.len() - 2, new_data.len());
2316        assert_eq!(2, new_data.offset());
2317        assert_eq!(data.null_count() - 1, new_data.null_count());
2318    }
2319
2320    #[test]
2321    #[should_panic(expected = "offset + length overflow")]
2322    fn test_slice_panics_on_offset_length_overflow() {
2323        let data = ArrayData::builder(DataType::Int32)
2324            .len(4)
2325            .add_buffer(make_i32_buffer(4))
2326            .build()
2327            .unwrap();
2328        let sliced = data.slice(1, 3);
2329
2330        sliced.slice(1, usize::MAX);
2331    }
2332
2333    #[test]
2334    fn test_typed_offsets_length_overflow() {
2335        let data = ArrayData {
2336            data_type: DataType::Binary,
2337            len: usize::MAX,
2338            offset: 0,
2339            buffers: vec![Buffer::from_slice_ref([0_i32])],
2340            child_data: vec![],
2341            nulls: None,
2342        };
2343        let err = data.typed_offsets::<i32>().unwrap_err();
2344
2345        assert_eq!(
2346            err.to_string(),
2347            format!(
2348                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2349                usize::MAX
2350            )
2351        );
2352    }
2353
2354    #[test]
2355    fn test_validate_typed_buffer_length_overflow() {
2356        let data = ArrayData {
2357            data_type: DataType::Binary,
2358            len: 0,
2359            offset: 2,
2360            buffers: vec![Buffer::from_slice_ref([0_i32])],
2361            child_data: vec![],
2362            nulls: None,
2363        };
2364        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
2365
2366        assert_eq!(
2367            err.to_string(),
2368            format!(
2369                "Invalid argument error: Length {} with offset 2 overflows usize for Binary",
2370                usize::MAX
2371            )
2372        );
2373    }
2374
2375    // Exercises ArrayData::try_new with len + offset overflowing
2376    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, ArrowError> {
2377        ArrayData::try_new(
2378            DataType::Binary,
2379            usize::MAX,
2380            None,
2381            1,
2382            vec![
2383                Buffer::from_slice_ref([0_i32]),
2384                Buffer::from_iter(std::iter::empty::<u8>()),
2385            ],
2386            vec![],
2387        )
2388    }
2389
2390    #[cfg(not(feature = "force_validate"))]
2391    #[test]
2392    fn test_try_new_length_offset_overflow() {
2393        let err = try_new_binary_length_offset_overflow().unwrap_err();
2394
2395        assert_eq!(
2396            err.to_string(),
2397            format!(
2398                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2399                usize::MAX
2400            )
2401        );
2402    }
2403
2404    #[cfg(feature = "force_validate")]
2405    #[test]
2406    #[should_panic(
2407        expected = "Length 18446744073709551615 with offset 1 overflows usize for Binary"
2408    )]
2409    fn test_try_new_length_offset_overflow_force_validate() {
2410        try_new_binary_length_offset_overflow().unwrap();
2411    }
2412
2413    #[test]
2414    fn test_equality() {
2415        let int_data = ArrayData::builder(DataType::Int32)
2416            .len(1)
2417            .add_buffer(make_i32_buffer(1))
2418            .build()
2419            .unwrap();
2420
2421        let float_data = ArrayData::builder(DataType::Float32)
2422            .len(1)
2423            .add_buffer(make_f32_buffer(1))
2424            .build()
2425            .unwrap();
2426        assert_ne!(int_data, float_data);
2427        assert!(!int_data.ptr_eq(&float_data));
2428        assert!(int_data.ptr_eq(&int_data));
2429
2430        #[allow(clippy::redundant_clone)]
2431        let int_data_clone = int_data.clone();
2432        assert_eq!(int_data, int_data_clone);
2433        assert!(int_data.ptr_eq(&int_data_clone));
2434        assert!(int_data_clone.ptr_eq(&int_data));
2435
2436        let int_data_slice = int_data_clone.slice(1, 0);
2437        assert!(int_data_slice.ptr_eq(&int_data_slice));
2438        assert!(!int_data.ptr_eq(&int_data_slice));
2439        assert!(!int_data_slice.ptr_eq(&int_data));
2440
2441        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2442        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2443        let string_data = ArrayData::try_new(
2444            DataType::Utf8,
2445            3,
2446            Some(Buffer::from_iter(vec![true, false, true])),
2447            0,
2448            vec![offsets_buffer, data_buffer],
2449            vec![],
2450        )
2451        .unwrap();
2452
2453        assert_ne!(float_data, string_data);
2454        assert!(!float_data.ptr_eq(&string_data));
2455
2456        assert!(string_data.ptr_eq(&string_data));
2457
2458        #[allow(clippy::redundant_clone)]
2459        let string_data_cloned = string_data.clone();
2460        assert!(string_data_cloned.ptr_eq(&string_data));
2461        assert!(string_data.ptr_eq(&string_data_cloned));
2462
2463        let string_data_slice = string_data.slice(1, 2);
2464        assert!(string_data_slice.ptr_eq(&string_data_slice));
2465        assert!(!string_data_slice.ptr_eq(&string_data))
2466    }
2467
2468    #[test]
2469    fn test_slice_memory_size() {
2470        let mut bit_v: [u8; 2] = [0; 2];
2471        bit_util::set_bit(&mut bit_v, 0);
2472        bit_util::set_bit(&mut bit_v, 3);
2473        bit_util::set_bit(&mut bit_v, 10);
2474        let data = ArrayData::builder(DataType::Int32)
2475            .len(16)
2476            .add_buffer(make_i32_buffer(16))
2477            .null_bit_buffer(Some(Buffer::from(bit_v)))
2478            .build()
2479            .unwrap();
2480        let new_data = data.slice(1, 14);
2481        assert_eq!(
2482            data.get_slice_memory_size().unwrap() - 8,
2483            new_data.get_slice_memory_size().unwrap()
2484        );
2485        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2486        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2487        let string_data = ArrayData::try_new(
2488            DataType::Utf8,
2489            3,
2490            Some(Buffer::from_iter(vec![true, false, true])),
2491            0,
2492            vec![offsets_buffer, data_buffer],
2493            vec![],
2494        )
2495        .unwrap();
2496        let string_data_slice = string_data.slice(1, 2);
2497        //4 bytes of offset and 2 bytes of data reduced by slicing.
2498        assert_eq!(
2499            string_data.get_slice_memory_size().unwrap() - 6,
2500            string_data_slice.get_slice_memory_size().unwrap()
2501        );
2502    }
2503
2504    #[test]
2505    fn test_count_nulls() {
2506        let buffer = Buffer::from([0b00010110, 0b10011111]);
2507        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2508        let count = count_nulls(Some(&buffer), 0, 16);
2509        assert_eq!(count, 7);
2510
2511        let count = count_nulls(Some(&buffer), 4, 8);
2512        assert_eq!(count, 3);
2513    }
2514
2515    #[test]
2516    fn test_contains_nulls() {
2517        let buffer: Buffer =
2518            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2519        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2520        assert!(contains_nulls(Some(&buffer), 0, 6));
2521        assert!(contains_nulls(Some(&buffer), 0, 3));
2522        assert!(!contains_nulls(Some(&buffer), 3, 2));
2523        assert!(!contains_nulls(Some(&buffer), 0, 0));
2524    }
2525
2526    #[test]
2527    fn test_alignment() {
2528        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2529        let sliced = buffer.slice(1);
2530
2531        let mut data = ArrayData {
2532            data_type: DataType::Int32,
2533            len: 0,
2534            offset: 0,
2535            buffers: vec![buffer],
2536            child_data: vec![],
2537            nulls: None,
2538        };
2539        data.validate_full().unwrap();
2540
2541        // break alignment in data
2542        data.buffers[0] = sliced;
2543        let err = data.validate().unwrap_err();
2544
2545        assert_eq!(
2546            err.to_string(),
2547            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2548        );
2549
2550        data.align_buffers();
2551        data.validate_full().unwrap();
2552    }
2553
2554    #[test]
2555    fn test_alignment_struct() {
2556        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2557        let sliced = buffer.slice(1);
2558
2559        let child_data = ArrayData {
2560            data_type: DataType::Int32,
2561            len: 0,
2562            offset: 0,
2563            buffers: vec![buffer],
2564            child_data: vec![],
2565            nulls: None,
2566        };
2567
2568        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2569        let mut data = ArrayData {
2570            data_type: schema,
2571            len: 0,
2572            offset: 0,
2573            buffers: vec![],
2574            child_data: vec![child_data],
2575            nulls: None,
2576        };
2577        data.validate_full().unwrap();
2578
2579        // break alignment in child data
2580        data.child_data[0].buffers[0] = sliced;
2581        let err = data.validate().unwrap_err();
2582
2583        assert_eq!(
2584            err.to_string(),
2585            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2586        );
2587
2588        data.align_buffers();
2589        data.validate_full().unwrap();
2590    }
2591
2592    #[test]
2593    fn test_null_view_types() {
2594        let array_len = 32;
2595        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2596        assert_eq!(array.len(), array_len);
2597        for i in 0..array.len() {
2598            assert!(array.is_null(i));
2599        }
2600
2601        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2602        assert_eq!(array.len(), array_len);
2603        for i in 0..array.len() {
2604            assert!(array.is_null(i));
2605        }
2606
2607        let array = ArrayData::new_null(
2608            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2609            array_len,
2610        );
2611        assert_eq!(array.len(), array_len);
2612        for i in 0..array.len() {
2613            assert!(array.is_null(i));
2614        }
2615
2616        let array = ArrayData::new_null(
2617            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2618            array_len,
2619        );
2620        assert_eq!(array.len(), array_len);
2621        for i in 0..array.len() {
2622            assert!(array.is_null(i));
2623        }
2624    }
2625}