Skip to main content

arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow arrays.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer, bit_util, i256,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::{equal, validate_binary_view, validate_string_view};
32
33#[inline]
34pub(crate) fn contains_nulls(
35    null_bit_buffer: Option<&NullBuffer>,
36    offset: usize,
37    len: usize,
38) -> bool {
39    match null_bit_buffer {
40        Some(buffer) => {
41            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
42                Some((start, end)) => start != 0 || end != len,
43                None => len != 0, // No non-null values
44            }
45        }
46        None => false, // No null buffer
47    }
48}
49
50#[inline]
51pub(crate) fn count_nulls(
52    null_bit_buffer: Option<&NullBuffer>,
53    offset: usize,
54    len: usize,
55) -> usize {
56    if let Some(buf) = null_bit_buffer {
57        let buffer = buf.buffer();
58        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
59    } else {
60        0
61    }
62}
63
64/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
65#[inline]
66pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
67    let empty_buffer = MutableBuffer::new(0);
68    match data_type {
69        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
70        DataType::Boolean => {
71            let bytes = bit_util::ceil(capacity, 8);
72            let buffer = MutableBuffer::new(bytes);
73            [buffer, empty_buffer]
74        }
75        DataType::UInt8
76        | DataType::UInt16
77        | DataType::UInt32
78        | DataType::UInt64
79        | DataType::Int8
80        | DataType::Int16
81        | DataType::Int32
82        | DataType::Int64
83        | DataType::Float16
84        | DataType::Float32
85        | DataType::Float64
86        | DataType::Decimal32(_, _)
87        | DataType::Decimal64(_, _)
88        | DataType::Decimal128(_, _)
89        | DataType::Decimal256(_, _)
90        | DataType::Date32
91        | DataType::Time32(_)
92        | DataType::Date64
93        | DataType::Time64(_)
94        | DataType::Duration(_)
95        | DataType::Timestamp(_, _)
96        | DataType::Interval(_) => [
97            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
98            empty_buffer,
99        ],
100        DataType::Utf8 | DataType::Binary => {
101            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
102            // safety: `unsafe` code assumes that this buffer is initialized with one element
103            buffer.push(0i32);
104            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
105        }
106        DataType::LargeUtf8 | DataType::LargeBinary => {
107            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
108            // safety: `unsafe` code assumes that this buffer is initialized with one element
109            buffer.push(0i64);
110            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
111        }
112        DataType::BinaryView | DataType::Utf8View => [
113            MutableBuffer::new(capacity * mem::size_of::<u128>()),
114            empty_buffer,
115        ],
116        DataType::List(_) | DataType::Map(_, _) => {
117            // offset buffer always starts with a zero
118            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
119            buffer.push(0i32);
120            [buffer, empty_buffer]
121        }
122        DataType::ListView(_) => [
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124            MutableBuffer::new(capacity * mem::size_of::<i32>()),
125        ],
126        DataType::LargeList(_) => {
127            // offset buffer always starts with a zero
128            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
129            buffer.push(0i64);
130            [buffer, empty_buffer]
131        }
132        DataType::LargeListView(_) => [
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134            MutableBuffer::new(capacity * mem::size_of::<i64>()),
135        ],
136        DataType::FixedSizeBinary(size) => {
137            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
138        }
139        DataType::Dictionary(k, _) => [
140            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
141            empty_buffer,
142        ],
143        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
144            [empty_buffer, MutableBuffer::new(0)]
145        }
146        DataType::Union(_, mode) => {
147            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
148            match mode {
149                UnionMode::Sparse => [type_ids, empty_buffer],
150                UnionMode::Dense => {
151                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
152                    [type_ids, offsets]
153                }
154            }
155        }
156    }
157}
158
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow arrays.
///
/// Specific operations for different array types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
/// `ArrayData` has references to one or more underlying data buffers
/// and optional child ArrayData, depending on type as illustrated
/// below. Bitmaps are not shown for simplicity but they are stored
/// similarly to the buffers.
///
/// ```text
///                        offset
///                       points to
/// ┌───────────────────┐ start of  ┌───────┐       Different
/// │                   │   data    │       │     ArrayData may
/// │ArrayData {        │           │....   │     also refer to
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
/// │  buffers: [       │           │5882   │◀─
/// │    ...            │  │        │4323   │
/// │  ]                │   ─ ─ ─ ─▶│4859   │
/// │  child_data: [    │           │....   │
/// │    ...            │           │       │
/// │  ]                │           └───────┘
/// │}                  │
/// │                   │            Shared Buffer uses
/// │               │   │            bytes::Bytes to hold
/// └───────────────────┘            actual data values
///           ┌ ─ ─ ┘
///
///           ▼
/// ┌───────────────────┐
/// │ArrayData {        │
/// │  ...              │
/// │}                  │
/// │                   │
/// └───────────────────┘
///
/// Child ArrayData may also have its own buffers and children
/// ```
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type
    data_type: DataType,

    /// The number of elements
    len: usize,

    /// The offset in number of items (not bytes).
    ///
    /// The offset applies to [`Self::child_data`] and [`Self::buffers`]. It
    /// does NOT apply to [`Self::nulls`].
    offset: usize,

    /// The buffers that store the actual data for this array, as defined
    /// in the [Arrow Spec].
    ///
    /// Depending on the array types, [`Self::buffers`] can hold different
    /// kinds of buffers (e.g., value buffer, value offset buffer) at different
    /// positions.
    ///
    /// The buffer may be larger than needed.  Some items at the beginning may be skipped if
    /// there is an `offset`.  Some items at the end may be skipped if the buffer is longer than
    /// we need to satisfy `len`.
    ///
    /// [Arrow Spec]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout
    buffers: Vec<Buffer>,

    /// The child(ren) of this array.
    ///
    /// Only non-empty for nested types, such as `ListArray` and
    /// `StructArray`.
    ///
    /// The first logical element in each child element begins at `offset`.
    ///
    /// If the child element also has an offset then these offsets are
    /// cumulative.
    child_data: Vec<ArrayData>,

    /// The null bitmap.
    ///
    /// `None` indicates all values are non-null in this array.
    ///
    /// [`Self::offset`] does not apply to the null bitmap. While the
    /// BooleanBuffer may be sliced (have its own offset) internally, this
    /// `NullBuffer` always represents exactly `len` elements.
    nulls: Option<NullBuffer>,
}
252
/// A thread-safe, shared reference to the Arrow array data.
///
/// This is an alias for an [`Arc`] wrapping an [`ArrayData`].
pub type ArrayDataRef = Arc<ArrayData>;
255
256fn checked_len_plus_offset(
257    data_type: &DataType,
258    len: usize,
259    offset: usize,
260) -> Result<usize, ArrowError> {
261    len.checked_add(offset).ok_or_else(|| {
262        ArrowError::InvalidArgumentError(format!(
263            "Length {len} with offset {offset} overflows usize for {data_type}"
264        ))
265    })
266}
267
268impl ArrayData {
269    /// Create a new ArrayData instance;
270    ///
271    /// If `null_count` is not specified, the number of nulls in
272    /// null_bit_buffer is calculated.
273    ///
274    /// If the number of nulls is 0 then the null_bit_buffer
275    /// is set to `None`.
276    ///
277    /// # Safety
278    ///
279    /// The input values *must* form a valid Arrow array for
280    /// `data_type`, or undefined behavior can result.
281    ///
282    /// Note: This is a low level API and most users of the arrow
283    /// crate should create arrays using the methods in the `array`
284    /// module.
285    pub unsafe fn new_unchecked(
286        data_type: DataType,
287        len: usize,
288        null_count: Option<usize>,
289        null_bit_buffer: Option<Buffer>,
290        offset: usize,
291        buffers: Vec<Buffer>,
292        child_data: Vec<ArrayData>,
293    ) -> Self {
294        let mut skip_validation = UnsafeFlag::new();
295        // SAFETY: caller responsible for ensuring data is valid
296        unsafe { skip_validation.set(true) };
297
298        ArrayDataBuilder {
299            data_type,
300            len,
301            null_count,
302            null_bit_buffer,
303            nulls: None,
304            offset,
305            buffers,
306            child_data,
307            align_buffers: false,
308            skip_validation,
309        }
310        .build()
311        .unwrap()
312    }
313
314    /// Create a new ArrayData, validating that the provided buffers form a valid
315    /// Arrow array of the specified data type.
316    ///
317    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
318    /// is set to `None`.
319    ///
320    /// Internally this calls through to [`Self::validate_data`]
321    ///
322    /// Note: This is a low level API and most users of the arrow crate should create
323    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
324    /// or [`ArrayDataBuilder`].
325    ///
326    /// See also [`Self::into_parts`] to recover the fields
327    pub fn try_new(
328        data_type: DataType,
329        len: usize,
330        null_bit_buffer: Option<Buffer>,
331        offset: usize,
332        buffers: Vec<Buffer>,
333        child_data: Vec<ArrayData>,
334    ) -> Result<Self, ArrowError> {
335        // we must check the length of `null_bit_buffer` first
336        // because we use this buffer to calculate `null_count`
337        // in `Self::new_unchecked`.
338        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
339            let len_plus_offset = checked_len_plus_offset(&data_type, len, offset)?;
340            let needed_len = bit_util::ceil(len_plus_offset, 8);
341            if null_bit_buffer.len() < needed_len {
342                return Err(ArrowError::InvalidArgumentError(format!(
343                    "null_bit_buffer size too small. got {} needed {}",
344                    null_bit_buffer.len(),
345                    needed_len
346                )));
347            }
348        }
349        // Safety justification: `validate_full` is called below
350        let new_self = unsafe {
351            Self::new_unchecked(
352                data_type,
353                len,
354                None,
355                null_bit_buffer,
356                offset,
357                buffers,
358                child_data,
359            )
360        };
361
362        // As the data is not trusted, do a full validation of its contents
363        // We don't need to validate children as we can assume that the
364        // [`ArrayData`] in `child_data` have already been validated through
365        // a call to `ArrayData::try_new` or created using unsafe
366        new_self.validate_data()?;
367        Ok(new_self)
368    }
369
370    /// Return the constituent parts of this ArrayData
371    ///
372    /// This is the inverse of [`ArrayData::try_new`].
373    ///
374    /// Returns `(data_type, len, nulls, offset, buffers, child_data)`
375    pub fn into_parts(
376        self,
377    ) -> (
378        DataType,
379        usize,
380        Option<NullBuffer>,
381        usize,
382        Vec<Buffer>,
383        Vec<ArrayData>,
384    ) {
385        let Self {
386            data_type,
387            len,
388            nulls,
389            offset,
390            buffers,
391            child_data,
392        } = self;
393
394        (data_type, len, nulls, offset, buffers, child_data)
395    }
396
/// Returns an [`ArrayDataBuilder`] for constructing an [`ArrayData`] of the given
/// [`DataType`]
///
/// This simply forwards `data_type` to `ArrayDataBuilder::new`.
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
    ArrayDataBuilder::new(data_type)
}
402
/// Returns a reference to the [`DataType`] of this [`ArrayData`]
///
/// The data type is borrowed, not cloned.
#[inline]
pub const fn data_type(&self) -> &DataType {
    &self.data_type
}
408
409    /// Returns the [`Buffer`] storing data for this [`ArrayData`]
410    pub fn buffers(&self) -> &[Buffer] {
411        &self.buffers
412    }
413
414    /// Returns a slice of children [`ArrayData`]. This will be non
415    /// empty for type such as lists and structs.
416    pub fn child_data(&self) -> &[ArrayData] {
417        &self.child_data[..]
418    }
419
420    /// Returns whether the element at index `i` is null
421    #[inline]
422    pub fn is_null(&self, i: usize) -> bool {
423        match &self.nulls {
424            Some(v) => v.is_null(i),
425            None => false,
426        }
427    }
428
/// Returns a reference to the null buffer of this [`ArrayData`], or `None` if
/// it has no null buffer
///
/// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`]
#[inline]
pub fn nulls(&self) -> Option<&NullBuffer> {
    self.nulls.as_ref()
}
436
/// Returns whether the element at index `i` is not null
///
/// This is the negation of [`Self::is_null`].
#[inline]
pub fn is_valid(&self, i: usize) -> bool {
    !self.is_null(i)
}
442
/// Returns the length (i.e., number of elements) of this [`ArrayData`].
///
/// The length is counted in elements, not bytes.
#[inline]
pub const fn len(&self) -> usize {
    self.len
}
448
449    /// Returns whether this [`ArrayData`] is empty
450    #[inline]
451    pub const fn is_empty(&self) -> bool {
452        self.len == 0
453    }
454
/// Returns the offset of this [`ArrayData`]
///
/// The offset is counted in elements, not bytes.
#[inline]
pub const fn offset(&self) -> usize {
    self.offset
}
460
461    /// Returns the total number of nulls in this array
462    #[inline]
463    pub fn null_count(&self) -> usize {
464        self.nulls
465            .as_ref()
466            .map(|x| x.null_count())
467            .unwrap_or_default()
468    }
469
470    /// Returns the total number of bytes of memory occupied by the
471    /// buffers owned by this [`ArrayData`] and all of its
472    /// children. (See also diagram on [`ArrayData`]).
473    ///
474    /// Note that this [`ArrayData`] may only refer to a subset of the
475    /// data in the underlying [`Buffer`]s (due to `offset` and
476    /// `length`), but the size returned includes the entire size of
477    /// the buffers.
478    ///
479    /// If multiple [`ArrayData`]s refer to the same underlying
480    /// [`Buffer`]s they will both report the same size.
481    pub fn get_buffer_memory_size(&self) -> usize {
482        let mut size = 0;
483        for buffer in &self.buffers {
484            size += buffer.capacity();
485        }
486        if let Some(bitmap) = &self.nulls {
487            size += bitmap.buffer().capacity()
488        }
489        for child in &self.child_data {
490            size += child.get_buffer_memory_size();
491        }
492        size
493    }
494
495    /// Returns the total number of the bytes of memory occupied by
496    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
497    ///
498    /// This is approximately the number of bytes if a new
499    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
500    /// exactly the data needed.
501    ///
502    /// For example, a [`DataType::Int64`] with `100` elements,
503    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
504    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
505    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
506    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
507    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
508        let mut result: usize = 0;
509        let layout = layout(&self.data_type);
510
511        for spec in layout.buffers.iter() {
512            match spec {
513                BufferSpec::FixedWidth { byte_width, .. } => {
514                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
515                        ArrowError::ComputeError(
516                            "Integer overflow computing buffer size".to_string(),
517                        )
518                    })?;
519                    result += buffer_size;
520                }
521                BufferSpec::VariableWidth => {
522                    let buffer_len = match self.data_type {
523                        DataType::Utf8 | DataType::Binary => {
524                            let offsets = self.typed_offsets::<i32>()?;
525                            (offsets[self.len] - offsets[0]) as usize
526                        }
527                        DataType::LargeUtf8 | DataType::LargeBinary => {
528                            let offsets = self.typed_offsets::<i64>()?;
529                            (offsets[self.len] - offsets[0]) as usize
530                        }
531                        _ => {
532                            return Err(ArrowError::NotYetImplemented(format!(
533                                "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
534                                self.data_type
535                            )));
536                        }
537                    };
538                    result += buffer_len;
539                }
540                BufferSpec::BitMap => {
541                    let buffer_size = bit_util::ceil(self.len, 8);
542                    result += buffer_size;
543                }
544                BufferSpec::AlwaysNull => {
545                    // Nothing to do
546                }
547            }
548        }
549
550        if self.nulls().is_some() {
551            result += bit_util::ceil(self.len, 8);
552        }
553
554        for child in &self.child_data {
555            result += child.get_slice_memory_size()?;
556        }
557        Ok(result)
558    }
559
560    /// Returns the total number of bytes of memory occupied
561    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
562    /// children. (See also diagram on [`ArrayData`]).
563    ///
564    /// Equivalent to:
565    ///  `size_of_val(self)` +
566    ///  [`Self::get_buffer_memory_size`] +
567    ///  `size_of_val(child)` for all children
568    pub fn get_array_memory_size(&self) -> usize {
569        let mut size = mem::size_of_val(self);
570
571        // Calculate rest of the fields top down which contain actual data
572        for buffer in &self.buffers {
573            size += mem::size_of::<Buffer>();
574            size += buffer.capacity();
575        }
576        if let Some(nulls) = &self.nulls {
577            size += nulls.buffer().capacity();
578        }
579        for child in &self.child_data {
580            size += child.get_array_memory_size();
581        }
582
583        size
584    }
585
586    /// Creates a zero-copy slice of itself. This creates a new
587    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
588    /// different offset and len
589    ///
590    /// # Panics
591    ///
592    /// Panics if `offset + length` overflows or is greater than `self.len()`.
593    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
594        let end = offset
595            .checked_add(length)
596            .expect("offset + length overflow");
597        assert!(end <= self.len());
598
599        if let DataType::Struct(_) = self.data_type() {
600            // Slice into children
601            let new_offset = self.offset + offset;
602            ArrayData {
603                data_type: self.data_type().clone(),
604                len: length,
605                offset: new_offset,
606                buffers: self.buffers.clone(),
607                // Slice child data, to propagate offsets down to them
608                child_data: self
609                    .child_data()
610                    .iter()
611                    .map(|data| data.slice(offset, length))
612                    .collect(),
613                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
614            }
615        } else {
616            let mut new_data = self.clone();
617
618            new_data.len = length;
619            new_data.offset = offset + self.offset;
620            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
621
622            new_data
623        }
624    }
625
626    /// Returns the `buffer` as a slice of type `T` starting at self.offset
627    ///
628    /// # Panics
629    /// This function panics if:
630    /// * the buffer is not byte-aligned with type T, or
631    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
632    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
633        &self.buffers()[buffer].typed_data()[self.offset..]
634    }
635
/// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
///
/// Buffers are zero-filled. Most types additionally get an all-null
/// [`NullBuffer`] of `len` elements; `Null`, `Union` and `RunEndEncoded` do
/// not, and express their nulls through their children (or, for `Null`,
/// carry no buffers at all).
///
/// # Panics
///
/// Panics if:
/// * `data_type` is a `Union` without any field
/// * `data_type` is a dense `Union` and `len` does not fit in an `i32`
/// * `data_type` is `RunEndEncoded` and `len` does not fit in the run ends
///   type, or the run ends type is not `Int16`, `Int32` or `Int64`
/// * `data_type` is a `Dictionary` whose key type has no primitive width
pub fn new_null(data_type: &DataType, len: usize) -> Self {
    // Number of bytes needed to hold `len` bits
    let bit_len = bit_util::ceil(len, 8);
    // Builds an immutable, zero-filled buffer of `len` bytes
    let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));

    // `has_nulls` records whether an all-null `NullBuffer` is attached below
    let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
        // Fixed-width primitives: one zeroed value per element
        Some(width) => (vec![zeroed(width * len)], vec![], true),
        None => match data_type {
            DataType::Null => (vec![], vec![], false),
            DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
            // `len + 1` zeroed 4-byte offsets and an empty values buffer
            DataType::Binary | DataType::Utf8 => {
                (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
            }
            // One zeroed 16-byte view per element
            DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
            // `len + 1` zeroed 8-byte offsets and an empty values buffer
            DataType::LargeBinary | DataType::LargeUtf8 => {
                (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
            }
            DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
            // All offsets are zero, so the child array can stay empty
            DataType::List(f) | DataType::Map(f, _) => (
                vec![zeroed((len + 1) * 4)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            DataType::LargeList(f) => (
                vec![zeroed((len + 1) * 8)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            // Two zeroed buffers of `len` 4-byte entries each, with an empty child
            DataType::ListView(f) => (
                vec![zeroed(len * 4), zeroed(len * 4)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            DataType::LargeListView(f) => (
                vec![zeroed(len * 8), zeroed(len * 8)],
                vec![ArrayData::new_empty(f.data_type())],
                true,
            ),
            // The child holds `list_len` null values for every parent element
            DataType::FixedSizeList(f, list_len) => (
                vec![],
                vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
                true,
            ),
            // Every field gets its own all-null child of the same length
            DataType::Struct(fields) => (
                vec![],
                fields
                    .iter()
                    .map(|f| Self::new_null(f.data_type(), len))
                    .collect(),
                true,
            ),
            // Zeroed keys with an empty values child
            DataType::Dictionary(k, v) => (
                vec![zeroed(k.primitive_width().unwrap() * len)],
                vec![ArrayData::new_empty(v.as_ref())],
                true,
            ),
            DataType::Union(f, mode) => {
                // Every element uses the type id of the first field
                let (id, _) = f.iter().next().unwrap();
                let ids = Buffer::from_iter(std::iter::repeat_n(id, len));
                let buffers = match mode {
                    UnionMode::Sparse => vec![ids],
                    UnionMode::Dense => {
                        // Dense offsets are simply 0..len into the first child
                        let end_offset = i32::from_usize(len).unwrap();
                        vec![ids, Buffer::from_iter(0_i32..end_offset)]
                    }
                };

                // Sparse unions need `len` values in every child; dense unions
                // only in the first child, which is the one the offsets point to
                let children = f
                    .iter()
                    .enumerate()
                    .map(|(idx, (_, f))| {
                        if idx == 0 || *mode == UnionMode::Sparse {
                            Self::new_null(f.data_type(), len)
                        } else {
                            Self::new_empty(f.data_type())
                        }
                    })
                    .collect();

                (buffers, children, false)
            }
            DataType::RunEndEncoded(r, v) => {
                if len == 0 {
                    // For empty arrays, create zero-length child arrays.
                    let runs = ArrayData::new_empty(r.data_type());
                    let values = ArrayData::new_empty(v.data_type());
                    (vec![], vec![runs, values], false)
                } else {
                    // A single run ending at `len`, paired with a single null value
                    let runs = match r.data_type() {
                        DataType::Int16 => {
                            let i = i16::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int32 => {
                            let i = i32::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        DataType::Int64 => {
                            let i = i64::from_usize(len).expect("run overflow");
                            Buffer::from_slice_ref([i])
                        }
                        dt => unreachable!("Invalid run ends data type {dt}"),
                    };

                    let builder = ArrayData::builder(r.data_type().clone())
                        .len(1)
                        .buffers(vec![runs]);

                    // SAFETY:
                    // Valid by construction
                    let runs = unsafe { builder.build_unchecked() };
                    (
                        vec![],
                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
                        false,
                    )
                }
            }
            // Handled by Some(width) branch above
            DataType::Int8
            | DataType::Int16
            | DataType::Int32
            | DataType::Int64
            | DataType::UInt8
            | DataType::UInt16
            | DataType::UInt32
            | DataType::UInt64
            | DataType::Float16
            | DataType::Float32
            | DataType::Float64
            | DataType::Timestamp(_, _)
            | DataType::Date32
            | DataType::Date64
            | DataType::Time32(_)
            | DataType::Time64(_)
            | DataType::Duration(_)
            | DataType::Interval(_)
            | DataType::Decimal32(_, _)
            | DataType::Decimal64(_, _)
            | DataType::Decimal128(_, _)
            | DataType::Decimal256(_, _) => unreachable!("{data_type}"),
        },
    };

    let mut builder = ArrayDataBuilder::new(data_type.clone())
        .len(len)
        .buffers(buffers)
        .child_data(child_data);

    // Attach the all-null mask for the types flagged above
    if has_nulls {
        builder = builder.nulls(Some(NullBuffer::new_null(len)))
    }

    // SAFETY:
    // Data valid by construction
    unsafe { builder.build_unchecked() }
}
793
/// Returns a new empty [ArrayData] valid for `data_type`.
///
/// This is [`Self::new_null`] called with a length of `0`.
pub fn new_empty(data_type: &DataType) -> Self {
    Self::new_null(data_type, 0)
}
798
799    /// Verifies that the buffers meet the minimum alignment requirements for the data type
800    ///
801    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
802    ///
803    /// This can be useful for when interacting with data sent over IPC or FFI, that may
804    /// not meet the minimum alignment requirements
805    ///
806    /// This also aligns buffers of children data
807    pub fn align_buffers(&mut self) {
808        let layout = layout(&self.data_type);
809        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
810            if let BufferSpec::FixedWidth { alignment, .. } = spec {
811                if buffer.as_ptr().align_offset(*alignment) != 0 {
812                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
813                }
814            }
815        }
816        // align children data recursively
817        for data in self.child_data.iter_mut() {
818            data.align_buffers()
819        }
820    }
821
822    /// "cheap" validation of an `ArrayData`. Ensures buffers are
823    /// sufficiently sized to store `len` + `offset` total elements of
824    /// `data_type` and performs other inexpensive consistency checks.
825    ///
826    /// This check is "cheap" in the sense that it does not validate the
827    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
828    /// are within the bounds of the values buffer).
829    ///
830    /// See [ArrayData::validate_data] to validate fully the offset content
831    /// and the validity of utf8 data
832    pub fn validate(&self) -> Result<(), ArrowError> {
833        // Need at least this much space in each buffer
834        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
835
836        // Check that the data layout conforms to the spec
837        let layout = layout(&self.data_type);
838
839        if !layout.can_contain_null_mask && self.nulls.is_some() {
840            return Err(ArrowError::InvalidArgumentError(format!(
841                "Arrays of type {:?} cannot contain a null bitmask",
842                self.data_type,
843            )));
844        }
845
846        // Check data buffers length for view types and other types
847        if self.buffers.len() < layout.buffers.len()
848            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
849        {
850            return Err(ArrowError::InvalidArgumentError(format!(
851                "Expected {} buffers in array of type {:?}, got {}",
852                layout.buffers.len(),
853                self.data_type,
854                self.buffers.len(),
855            )));
856        }
857
858        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
859            match spec {
860                BufferSpec::FixedWidth {
861                    byte_width,
862                    alignment,
863                } => {
864                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
865
866                    if buffer.len() < min_buffer_size {
867                        return Err(ArrowError::InvalidArgumentError(format!(
868                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
869                            min_buffer_size,
870                            i,
871                            self.data_type,
872                            buffer.len()
873                        )));
874                    }
875
876                    let align_offset = buffer.as_ptr().align_offset(*alignment);
877                    if align_offset != 0 {
878                        return Err(ArrowError::InvalidArgumentError(format!(
879                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
880                            self.data_type,
881                            align_offset.min(alignment - align_offset)
882                        )));
883                    }
884                }
885                BufferSpec::VariableWidth => {
886                    // not cheap to validate (need to look at the
887                    // data). Partially checked in validate_offsets
888                    // called below. Can check with `validate_full`
889                }
890                BufferSpec::BitMap => {
891                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
892                    if buffer.len() < min_buffer_size {
893                        return Err(ArrowError::InvalidArgumentError(format!(
894                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
895                            min_buffer_size,
896                            i,
897                            self.data_type,
898                            buffer.len()
899                        )));
900                    }
901                }
902                BufferSpec::AlwaysNull => {
903                    // Nothing to validate
904                }
905            }
906        }
907
908        // check null bit buffer size
909        if let Some(nulls) = self.nulls() {
910            if nulls.null_count() > self.len {
911                return Err(ArrowError::InvalidArgumentError(format!(
912                    "null_count {} for an array exceeds length of {} elements",
913                    nulls.null_count(),
914                    self.len
915                )));
916            }
917
918            let actual_len = nulls.validity().len();
919            let needed_len = bit_util::ceil(len_plus_offset, 8);
920            if actual_len < needed_len {
921                return Err(ArrowError::InvalidArgumentError(format!(
922                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
923                )));
924            }
925
926            if nulls.len() != self.len {
927                return Err(ArrowError::InvalidArgumentError(format!(
928                    "null buffer incorrect size. got {} expected {}",
929                    nulls.len(),
930                    self.len
931                )));
932            }
933        }
934
935        self.validate_child_data()?;
936
937        // Additional Type specific checks
938        match &self.data_type {
939            DataType::Utf8 | DataType::Binary => {
940                self.validate_offsets::<i32>(self.buffers[1].len())?;
941            }
942            DataType::LargeUtf8 | DataType::LargeBinary => {
943                self.validate_offsets::<i64>(self.buffers[1].len())?;
944            }
945            DataType::Dictionary(key_type, _value_type) => {
946                // At the moment, constructing a DictionaryArray will also check this
947                if !DataType::is_dictionary_key_type(key_type) {
948                    return Err(ArrowError::InvalidArgumentError(format!(
949                        "Dictionary key type must be integer, but was {key_type}"
950                    )));
951                }
952            }
953            DataType::RunEndEncoded(run_ends_type, _) => {
954                if run_ends_type.is_nullable() {
955                    return Err(ArrowError::InvalidArgumentError(
956                        "The nullable should be set to false for the field defining run_ends array.".to_string()
957                    ));
958                }
959                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
960                    return Err(ArrowError::InvalidArgumentError(format!(
961                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
962                        run_ends_type.data_type()
963                    )));
964                }
965            }
966            _ => {}
967        };
968
969        Ok(())
970    }
971
972    /// Returns a reference to the data in `buffer` as a typed slice
973    /// (typically `&[i32]` or `&[i64]`) after validating. The
974    /// returned slice is guaranteed to have at least `self.len + 1`
975    /// entries.
976    ///
977    /// For an empty array, the `buffer` can also be empty.
978    fn typed_offsets<T: ArrowNativeType + num_traits::Num>(&self) -> Result<&[T], ArrowError> {
979        // An empty list-like array can have 0 offsets
980        if self.len == 0 && self.buffers[0].is_empty() {
981            return Ok(&[]);
982        }
983
984        let len = checked_len_plus_offset(&self.data_type, self.len, 1)?;
985
986        self.typed_buffer(0, len)
987    }
988
989    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
990    fn typed_buffer<T: ArrowNativeType + num_traits::Num>(
991        &self,
992        idx: usize,
993        len: usize,
994    ) -> Result<&[T], ArrowError> {
995        let buffer = &self.buffers[idx];
996
997        let required_elements = checked_len_plus_offset(&self.data_type, len, self.offset)?;
998        let byte_width = mem::size_of::<T>();
999        let required_len = required_elements.checked_mul(byte_width).ok_or_else(|| {
1000            ArrowError::InvalidArgumentError(format!(
1001                "Buffer {idx} of {} byte length overflow: {} elements of {} bytes exceeds usize",
1002                self.data_type, required_elements, byte_width
1003            ))
1004        })?;
1005
1006        if buffer.len() < required_len {
1007            return Err(ArrowError::InvalidArgumentError(format!(
1008                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
1009                idx,
1010                self.data_type,
1011                required_len,
1012                buffer.len()
1013            )));
1014        }
1015
1016        Ok(&buffer.typed_data::<T>()[self.offset..required_elements])
1017    }
1018
1019    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1020    /// offsets (of type T) into some other buffer of `values_length` bytes long
1021    fn validate_offsets<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1022        &self,
1023        values_length: usize,
1024    ) -> Result<(), ArrowError> {
1025        // Justification: buffer size was validated above
1026        let offsets = self.typed_offsets::<T>()?;
1027        if offsets.is_empty() {
1028            return Ok(());
1029        }
1030
1031        let first_offset = offsets[0].to_usize().ok_or_else(|| {
1032            ArrowError::InvalidArgumentError(format!(
1033                "Error converting offset[0] ({}) to usize for {}",
1034                offsets[0], self.data_type
1035            ))
1036        })?;
1037
1038        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
1039            ArrowError::InvalidArgumentError(format!(
1040                "Error converting offset[{}] ({}) to usize for {}",
1041                self.len, offsets[self.len], self.data_type
1042            ))
1043        })?;
1044
1045        if first_offset > values_length {
1046            return Err(ArrowError::InvalidArgumentError(format!(
1047                "First offset {} of {} is larger than values length {}",
1048                first_offset, self.data_type, values_length,
1049            )));
1050        }
1051
1052        if last_offset > values_length {
1053            return Err(ArrowError::InvalidArgumentError(format!(
1054                "Last offset {} of {} is larger than values length {}",
1055                last_offset, self.data_type, values_length,
1056            )));
1057        }
1058
1059        if first_offset > last_offset {
1060            return Err(ArrowError::InvalidArgumentError(format!(
1061                "First offset {} in {} is smaller than last offset {}",
1062                first_offset, self.data_type, last_offset,
1063            )));
1064        }
1065
1066        Ok(())
1067    }
1068
1069    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
1070    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
1071    fn validate_offsets_and_sizes<T: ArrowNativeType + num_traits::Num + std::fmt::Display>(
1072        &self,
1073        values_length: usize,
1074    ) -> Result<(), ArrowError> {
1075        let offsets: &[T] = self.typed_buffer(0, self.len)?;
1076        let sizes: &[T] = self.typed_buffer(1, self.len)?;
1077        if offsets.len() != sizes.len() {
1078            return Err(ArrowError::ComputeError(format!(
1079                "ListView offsets len {} does not match sizes len {}",
1080                offsets.len(),
1081                sizes.len()
1082            )));
1083        }
1084
1085        for i in 0..sizes.len() {
1086            let size = sizes[i].to_usize().ok_or_else(|| {
1087                ArrowError::InvalidArgumentError(format!(
1088                    "Error converting size[{}] ({}) to usize for {}",
1089                    i, sizes[i], self.data_type
1090                ))
1091            })?;
1092            let offset = offsets[i].to_usize().ok_or_else(|| {
1093                ArrowError::InvalidArgumentError(format!(
1094                    "Error converting offset[{}] ({}) to usize for {}",
1095                    i, offsets[i], self.data_type
1096                ))
1097            })?;
1098            if size
1099                .checked_add(offset)
1100                .expect("Offset and size have exceeded the usize boundary")
1101                > values_length
1102            {
1103                return Err(ArrowError::InvalidArgumentError(format!(
1104                    "Size {} at index {} is larger than the remaining values for {}",
1105                    size, i, self.data_type
1106                )));
1107            }
1108        }
1109        Ok(())
1110    }
1111
1112    /// Validates the layout of `child_data` ArrayData structures
1113    fn validate_child_data(&self) -> Result<(), ArrowError> {
1114        match &self.data_type {
1115            DataType::List(field) | DataType::Map(field, _) => {
1116                let values_data = self.get_single_valid_child_data(field.data_type())?;
1117                self.validate_offsets::<i32>(values_data.len)?;
1118                Ok(())
1119            }
1120            DataType::LargeList(field) => {
1121                let values_data = self.get_single_valid_child_data(field.data_type())?;
1122                self.validate_offsets::<i64>(values_data.len)?;
1123                Ok(())
1124            }
1125            DataType::ListView(field) => {
1126                let values_data = self.get_single_valid_child_data(field.data_type())?;
1127                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
1128                Ok(())
1129            }
1130            DataType::LargeListView(field) => {
1131                let values_data = self.get_single_valid_child_data(field.data_type())?;
1132                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1133                Ok(())
1134            }
1135            DataType::FixedSizeList(field, list_size) => {
1136                let values_data = self.get_single_valid_child_data(field.data_type())?;
1137
1138                let list_size: usize = (*list_size).try_into().map_err(|_| {
1139                    ArrowError::InvalidArgumentError(format!(
1140                        "{} has a negative list_size {}",
1141                        self.data_type, list_size
1142                    ))
1143                })?;
1144
1145                let expected_values_len = self.len
1146                    .checked_mul(list_size)
1147                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1148
1149                if values_data.len < expected_values_len {
1150                    return Err(ArrowError::InvalidArgumentError(format!(
1151                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1152                        values_data.len, self.len, list_size, self.data_type
1153                    )));
1154                }
1155
1156                Ok(())
1157            }
1158            DataType::Struct(fields) => {
1159                self.validate_num_child_data(fields.len())?;
1160                for (i, field) in fields.iter().enumerate() {
1161                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1162
1163                    // Ensure child field has sufficient size
1164                    if field_data.len < self.len {
1165                        return Err(ArrowError::InvalidArgumentError(format!(
1166                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1167                            self.data_type,
1168                            i,
1169                            field.name(),
1170                            field_data.len,
1171                            self.len
1172                        )));
1173                    }
1174                }
1175                Ok(())
1176            }
1177            DataType::RunEndEncoded(run_ends_field, values_field) => {
1178                self.validate_num_child_data(2)?;
1179                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1180                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1181                if run_ends_data.len != values_data.len {
1182                    return Err(ArrowError::InvalidArgumentError(format!(
1183                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1184                        run_ends_data.len, values_data.len
1185                    )));
1186                }
1187                if run_ends_data.nulls.is_some() {
1188                    return Err(ArrowError::InvalidArgumentError(
1189                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1190                    ));
1191                }
1192                Ok(())
1193            }
1194            DataType::Union(fields, mode) => {
1195                self.validate_num_child_data(fields.len())?;
1196
1197                for (i, (_, field)) in fields.iter().enumerate() {
1198                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1199
1200                    if mode == &UnionMode::Sparse {
1201                        let len_plus_offset =
1202                            checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1203                        if field_data.len < len_plus_offset {
1204                            return Err(ArrowError::InvalidArgumentError(format!(
1205                                "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1206                                i, field_data.len, len_plus_offset
1207                            )));
1208                        }
1209                    }
1210                }
1211                Ok(())
1212            }
1213            DataType::Dictionary(_key_type, value_type) => {
1214                self.get_single_valid_child_data(value_type)?;
1215                Ok(())
1216            }
1217            _ => {
1218                // other types do not have child data
1219                if !self.child_data.is_empty() {
1220                    return Err(ArrowError::InvalidArgumentError(format!(
1221                        "Expected no child arrays for type {} but got {}",
1222                        self.data_type,
1223                        self.child_data.len()
1224                    )));
1225                }
1226                Ok(())
1227            }
1228        }
1229    }
1230
1231    /// Ensures that this array data has a single child_data with the
1232    /// expected type, and calls `validate()` on it. Returns a
1233    /// reference to that child_data
1234    fn get_single_valid_child_data(
1235        &self,
1236        expected_type: &DataType,
1237    ) -> Result<&ArrayData, ArrowError> {
1238        self.validate_num_child_data(1)?;
1239        self.get_valid_child_data(0, expected_type)
1240    }
1241
1242    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1243    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1244        if self.child_data.len() != expected_len {
1245            Err(ArrowError::InvalidArgumentError(format!(
1246                "Value data for {} should contain {} child data array(s), had {}",
1247                self.data_type,
1248                expected_len,
1249                self.child_data.len()
1250            )))
1251        } else {
1252            Ok(())
1253        }
1254    }
1255
1256    /// Ensures that `child_data[i]` has the expected type, calls
1257    /// `validate()` on it, and returns a reference to that child_data
1258    fn get_valid_child_data(
1259        &self,
1260        i: usize,
1261        expected_type: &DataType,
1262    ) -> Result<&ArrayData, ArrowError> {
1263        let values_data = self.child_data.get(i).ok_or_else(|| {
1264            ArrowError::InvalidArgumentError(format!(
1265                "{} did not have enough child arrays. Expected at least {} but had only {}",
1266                self.data_type,
1267                i + 1,
1268                self.child_data.len()
1269            ))
1270        })?;
1271
1272        if expected_type != &values_data.data_type {
1273            return Err(ArrowError::InvalidArgumentError(format!(
1274                "Child type mismatch for {}. Expected {} but child data had {}",
1275                self.data_type, expected_type, values_data.data_type
1276            )));
1277        }
1278
1279        values_data.validate()?;
1280        Ok(values_data)
1281    }
1282
1283    /// Validate that the data contained within this [`ArrayData`] is valid
1284    ///
1285    /// 1. Null count is correct
1286    /// 2. All offsets are valid
1287    /// 3. All String data is valid UTF-8
1288    /// 4. All dictionary offsets are valid
1289    ///
1290    /// Internally this calls:
1291    ///
1292    /// * [`Self::validate`]
1293    /// * [`Self::validate_nulls`]
1294    /// * [`Self::validate_values`]
1295    ///
1296    /// Note: this does not recurse into children, for a recursive variant
1297    /// see [`Self::validate_full`]
1298    pub fn validate_data(&self) -> Result<(), ArrowError> {
1299        self.validate()?;
1300
1301        self.validate_nulls()?;
1302        self.validate_values()?;
1303        Ok(())
1304    }
1305
1306    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1307    ///
1308    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1309    /// and all its children recursively
1310    pub fn validate_full(&self) -> Result<(), ArrowError> {
1311        self.validate_data()?;
1312        // validate all children recursively
1313        self.child_data
1314            .iter()
1315            .enumerate()
1316            .try_for_each(|(i, child_data)| {
1317                child_data.validate_full().map_err(|e| {
1318                    ArrowError::InvalidArgumentError(format!(
1319                        "{} child #{} invalid: {}",
1320                        self.data_type, i, e
1321                    ))
1322                })
1323            })?;
1324        Ok(())
1325    }
1326
1327    /// Validates the values stored within this [`ArrayData`] are valid
1328    /// without recursing into child [`ArrayData`]
1329    ///
1330    /// Does not (yet) check
1331    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1332    /// 2. the the null count is correct and that any
1333    /// 3. nullability requirements of its children are correct
1334    ///
1335    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1336    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1337        if let Some(nulls) = &self.nulls {
1338            let actual = nulls.len() - nulls.inner().count_set_bits();
1339            if actual != nulls.null_count() {
1340                return Err(ArrowError::InvalidArgumentError(format!(
1341                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1342                    nulls.null_count(),
1343                    actual
1344                )));
1345            }
1346        }
1347
1348        // In general non-nullable children should not contain nulls, however, for certain
1349        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1350        // space in the child. As such we permit nulls in the children in the corresponding
1351        // positions for such types
1352        match &self.data_type {
1353            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1354                if !f.is_nullable() {
1355                    self.validate_non_nullable(None, &self.child_data[0])?
1356                }
1357            }
1358            DataType::FixedSizeList(field, len) => {
1359                let child = &self.child_data[0];
1360                if !field.is_nullable() {
1361                    match &self.nulls {
1362                        Some(nulls) => {
1363                            let element_len = *len as usize;
1364                            let expanded = nulls.expand(element_len);
1365                            self.validate_non_nullable(Some(&expanded), child)?;
1366                        }
1367                        None => self.validate_non_nullable(None, child)?,
1368                    }
1369                }
1370            }
1371            DataType::Struct(fields) => {
1372                for (field, child) in fields.iter().zip(&self.child_data) {
1373                    if !field.is_nullable() {
1374                        self.validate_non_nullable(self.nulls(), child)?
1375                    }
1376                }
1377            }
1378            _ => {}
1379        }
1380
1381        Ok(())
1382    }
1383
1384    /// Verifies that `child` contains no nulls not present in `mask`
1385    fn validate_non_nullable(
1386        &self,
1387        mask: Option<&NullBuffer>,
1388        child: &ArrayData,
1389    ) -> Result<(), ArrowError> {
1390        let mask = match mask {
1391            Some(mask) => mask,
1392            None => {
1393                return match child.null_count() {
1394                    0 => Ok(()),
1395                    _ => Err(ArrowError::InvalidArgumentError(format!(
1396                        "non-nullable child of type {} contains nulls not present in parent {}",
1397                        child.data_type, self.data_type
1398                    ))),
1399                };
1400            }
1401        };
1402
1403        match child.nulls() {
1404            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1405                "non-nullable child of type {} contains nulls not present in parent",
1406                child.data_type
1407            ))),
1408            _ => Ok(()),
1409        }
1410    }
1411
1412    /// Validates the values stored within this [`ArrayData`] are valid
1413    /// without recursing into child [`ArrayData`]
1414    ///
1415    /// Does not (yet) check
1416    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1417    pub fn validate_values(&self) -> Result<(), ArrowError> {
1418        match &self.data_type {
1419            DataType::Utf8 => self.validate_utf8::<i32>(),
1420            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1421            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1422            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1423            DataType::BinaryView => {
1424                let views = self.typed_buffer::<u128>(0, self.len)?;
1425                validate_binary_view(views, &self.buffers[1..])
1426            }
1427            DataType::Utf8View => {
1428                let views = self.typed_buffer::<u128>(0, self.len)?;
1429                validate_string_view(views, &self.buffers[1..])
1430            }
1431            DataType::List(_) | DataType::Map(_, _) => {
1432                let child = &self.child_data[0];
1433                self.validate_offsets_full::<i32>(child.len)
1434            }
1435            DataType::LargeList(_) => {
1436                let child = &self.child_data[0];
1437                self.validate_offsets_full::<i64>(child.len)
1438            }
1439            DataType::Union(_, _) => {
1440                // Validate Union Array as part of implementing new Union semantics
1441                // See comments in `ArrayData::validate()`
1442                // https://github.com/apache/arrow-rs/issues/85
1443                //
1444                // TODO file follow on ticket for full union validation
1445                Ok(())
1446            }
1447            DataType::Dictionary(key_type, _value_type) => {
1448                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1449                let max_value = dictionary_length - 1;
1450                match key_type.as_ref() {
1451                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1452                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1453                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1454                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1455                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1456                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1457                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1458                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1459                    _ => unreachable!(),
1460                }
1461            }
1462            DataType::RunEndEncoded(run_ends, _values) => {
1463                let run_ends_data = self.child_data()[0].clone();
1464                match run_ends.data_type() {
1465                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1466                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1467                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1468                    _ => unreachable!(),
1469                }
1470            }
1471            _ => {
1472                // No extra validation check required for other types
1473                Ok(())
1474            }
1475        }
1476    }
1477
1478    /// Calls the `validate(item_index, range)` function for each of
1479    /// the ranges specified in the arrow offsets buffer of type
1480    /// `T`. Also validates that each offset is smaller than
1481    /// `offset_limit`
1482    ///
1483    /// For an empty array, the offsets buffer can either be empty
1484    /// or contain a single `0`.
1485    ///
1486    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1487    /// function would call `validate([1,2])`, and `validate([2,4])`
1488    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1489    where
1490        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1491        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1492    {
1493        self.typed_offsets::<T>()?
1494            .iter()
1495            .enumerate()
1496            .map(|(i, x)| {
1497                // check if the offset can be converted to usize
1498                let r = x.to_usize().ok_or_else(|| {
1499                    ArrowError::InvalidArgumentError(format!(
1500                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1501                    );
1502                // check if the offset exceeds the limit
1503                match r {
1504                    Ok(n) if n <= offset_limit => Ok((i, n)),
1505                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1506                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1507                    ),
1508                    Err(e) => Err(e),
1509                }
1510            })
1511            .scan(0_usize, |start, end| {
1512                // check offsets are monotonically increasing
1513                match end {
1514                    Ok((i, end)) if *start <= end => {
1515                        let range = Some(Ok((i, *start..end)));
1516                        *start = end;
1517                        range
1518                    }
1519                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1520                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1521                        i - 1, start, end))
1522                    )),
1523                    Err(err) => Some(Err(err)),
1524                }
1525            })
1526            .skip(1) // the first element is meaningless
1527            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1528                let (item_index, range) = res?;
1529                validate(item_index-1, range)
1530            })
1531    }
1532
1533    /// Ensures that all strings formed by the offsets in `buffers[0]`
1534    /// into `buffers[1]` are valid utf8 sequences
1535    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1536    where
1537        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1538    {
1539        let values_buffer = &self.buffers[1].as_slice();
1540        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1541            // Validate Offsets are correct
1542            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1543                if !values_str.is_char_boundary(range.start)
1544                    || !values_str.is_char_boundary(range.end)
1545                {
1546                    return Err(ArrowError::InvalidArgumentError(format!(
1547                        "incomplete utf-8 byte sequence from index {string_index}"
1548                    )));
1549                }
1550                Ok(())
1551            })
1552        } else {
1553            // find specific offset that failed utf8 validation
1554            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1555                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1556                    ArrowError::InvalidArgumentError(format!(
1557                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1558                    ))
1559                })?;
1560                Ok(())
1561            })
1562        }
1563    }
1564
1565    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1566    /// between `0` and `offset_limit`
1567    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1568    where
1569        T: ArrowNativeType + TryInto<usize> + num_traits::Num + std::fmt::Display,
1570    {
1571        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1572            // No validation applied to each value, but the iteration
1573            // itself applies bounds checking to each range
1574            Ok(())
1575        })
1576    }
1577
1578    /// Validates that each value in self.buffers (typed as T)
1579    /// is within the range [0, max_value], inclusive
1580    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1581    where
1582        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1583    {
1584        let required_len = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1585        let buffer = &self.buffers[0];
1586
1587        // This should have been checked as part of `validate()` prior
1588        // to calling `validate_full()` but double check to be sure
1589        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1590
1591        // Justification: buffer size was validated above
1592        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..required_len];
1593
1594        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1595            // Do not check the value is null (value can be arbitrary)
1596            if self.is_null(i) {
1597                return Ok(());
1598            }
1599            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1600                ArrowError::InvalidArgumentError(format!(
1601                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1602                ))
1603            })?;
1604
1605            if dict_index < 0 || dict_index > max_value {
1606                return Err(ArrowError::InvalidArgumentError(format!(
1607                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1608                )));
1609            }
1610            Ok(())
1611        })
1612    }
1613
1614    /// Validates that each value in run_ends array is positive and strictly increasing.
1615    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1616    where
1617        T: ArrowNativeType + TryInto<i64> + num_traits::Num + std::fmt::Display,
1618    {
1619        let values = self.typed_buffer::<T>(0, self.len)?;
1620        let mut prev_value: i64 = 0_i64;
1621        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1622            let value: i64 = inp_value.try_into().map_err(|_| {
1623                ArrowError::InvalidArgumentError(format!(
1624                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1625                ))
1626            })?;
1627            if value <= 0_i64 {
1628                return Err(ArrowError::InvalidArgumentError(format!(
1629                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1630                )));
1631            }
1632            if ix > 0 && value <= prev_value {
1633                return Err(ArrowError::InvalidArgumentError(format!(
1634                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1635                )));
1636            }
1637
1638            prev_value = value;
1639            Ok(())
1640        })?;
1641
1642        let len_plus_offset = checked_len_plus_offset(&self.data_type, self.len, self.offset)?;
1643        if prev_value.as_usize() < len_plus_offset {
1644            return Err(ArrowError::InvalidArgumentError(format!(
1645                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1646                len_plus_offset
1647            )));
1648        }
1649        Ok(())
1650    }
1651
1652    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1653    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1654    /// return false when the arrays are logically equal
1655    pub fn ptr_eq(&self, other: &Self) -> bool {
1656        if self.offset != other.offset
1657            || self.len != other.len
1658            || self.data_type != other.data_type
1659            || self.buffers.len() != other.buffers.len()
1660            || self.child_data.len() != other.child_data.len()
1661        {
1662            return false;
1663        }
1664
1665        match (&self.nulls, &other.nulls) {
1666            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1667            (Some(_), None) | (None, Some(_)) => return false,
1668            _ => {}
1669        };
1670
1671        if !self
1672            .buffers
1673            .iter()
1674            .zip(other.buffers.iter())
1675            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1676        {
1677            return false;
1678        }
1679
1680        self.child_data
1681            .iter()
1682            .zip(other.child_data.iter())
1683            .all(|(a, b)| a.ptr_eq(b))
1684    }
1685
1686    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1687    pub fn into_builder(self) -> ArrayDataBuilder {
1688        self.into()
1689    }
1690
1691    /// Claim memory used by this ArrayData in the provided memory pool.
1692    ///
1693    /// This claims memory for:
1694    /// - All buffers in self.buffers
1695    /// - All child ArrayData recursively
1696    /// - The null buffer if present
1697    #[cfg(feature = "pool")]
1698    pub fn claim(&self, pool: &dyn arrow_buffer::MemoryPool) {
1699        // Claim all data buffers
1700        for buffer in &self.buffers {
1701            buffer.claim(pool);
1702        }
1703
1704        // Claim null buffer if present
1705        if let Some(nulls) = &self.nulls {
1706            nulls.claim(pool);
1707        }
1708
1709        // Recursively claim child data
1710        for child in &self.child_data {
1711            child.claim(pool);
1712        }
1713    }
1714}
1715
1716/// Return the expected [`DataTypeLayout`] Arrays of this data
1717/// type are expected to have
1718pub fn layout(data_type: &DataType) -> DataTypeLayout {
1719    // based on C/C++ implementation in
1720    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1721    use arrow_schema::IntervalUnit::*;
1722
1723    match data_type {
1724        DataType::Null => DataTypeLayout {
1725            buffers: vec![],
1726            can_contain_null_mask: false,
1727            variadic: false,
1728        },
1729        DataType::Boolean => DataTypeLayout {
1730            buffers: vec![BufferSpec::BitMap],
1731            can_contain_null_mask: true,
1732            variadic: false,
1733        },
1734        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1735        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1736        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1737        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1738        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1739        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1740        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1741        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1742        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1743        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1744        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1745        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1746        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1747        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1748        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1749        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1750        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1751        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1752        DataType::Interval(MonthDayNano) => {
1753            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1754        }
1755        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1756        DataType::Decimal32(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1757        DataType::Decimal64(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1758        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1759        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1760        DataType::FixedSizeBinary(size) => {
1761            let spec = BufferSpec::FixedWidth {
1762                byte_width: (*size).try_into().unwrap(),
1763                alignment: mem::align_of::<u8>(),
1764            };
1765            DataTypeLayout {
1766                buffers: vec![spec],
1767                can_contain_null_mask: true,
1768                variadic: false,
1769            }
1770        }
1771        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1772        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1773        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1774        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1775        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1776        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1777        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1778        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1779        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1780        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1781        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1782        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1783        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1784        DataType::Union(_, mode) => {
1785            let type_ids = BufferSpec::FixedWidth {
1786                byte_width: mem::size_of::<i8>(),
1787                alignment: mem::align_of::<i8>(),
1788            };
1789
1790            DataTypeLayout {
1791                buffers: match mode {
1792                    UnionMode::Sparse => {
1793                        vec![type_ids]
1794                    }
1795                    UnionMode::Dense => {
1796                        vec![
1797                            type_ids,
1798                            BufferSpec::FixedWidth {
1799                                byte_width: mem::size_of::<i32>(),
1800                                alignment: mem::align_of::<i32>(),
1801                            },
1802                        ]
1803                    }
1804                },
1805                can_contain_null_mask: false,
1806                variadic: false,
1807            }
1808        }
1809        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1810    }
1811}
1812
/// Layout specification for a data type
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// buffers.len(). Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}

impl DataTypeLayout {
    /// Builds the [`BufferSpec::FixedWidth`] whose width and alignment are
    /// the size and alignment of `T`
    fn fixed_width_spec<T>() -> BufferSpec {
        BufferSpec::FixedWidth {
            byte_width: mem::size_of::<T>(),
            alignment: mem::align_of::<T>(),
        }
    }

    /// Describes a basic numeric array where each element has type `T`
    pub fn new_fixed_width<T>() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<T>()],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// but may still have a Null Bitmap (e.g. FixedSizeList)
    pub fn new_nullable_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes arrays which have no data of their own
    /// (e.g. RunEndEncoded).
    pub fn new_empty() -> Self {
        Self {
            buffers: Vec::new(),
            can_contain_null_mask: false,
            variadic: false,
        }
    }

    /// Describes a variable width array: a fixed width offset buffer
    /// of type `T`, followed by a variable width data buffer
    pub fn new_binary<T>() -> Self {
        let offsets = Self::fixed_width_spec::<T>();
        let values = BufferSpec::VariableWidth;
        Self {
            buffers: vec![offsets, values],
            can_contain_null_mask: true,
            variadic: false,
        }
    }

    /// Describes a view type: one buffer of `u128` wide elements, optionally
    /// followed by further buffers (`variadic` is set)
    pub fn new_view() -> Self {
        Self {
            buffers: vec![Self::fixed_width_spec::<u128>()],
            can_contain_null_mask: true,
            variadic: true,
        }
    }

    /// Describes a list view type: two fixed width buffers of type `T`
    pub fn new_list_view<T>() -> Self {
        Self {
            buffers: vec![
                Self::fixed_width_spec::<T>(),
                Self::fixed_width_spec::<T>(),
            ],
            can_contain_null_mask: true,
            variadic: false,
        }
    }
}

/// Layout specification for a single data type buffer
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1944
1945impl PartialEq for ArrayData {
1946    fn eq(&self, other: &Self) -> bool {
1947        equal::equal(self, other)
1948    }
1949}
1950
/// A boolean flag that cannot be mutated outside of unsafe code.
///
/// Defaults to a value of false.
///
/// This structure is used to enforce safety in the [`ArrayDataBuilder`]
///
/// [`ArrayDataBuilder`]: super::ArrayDataBuilder
///
/// # Example
/// ```rust
/// use arrow_data::UnsafeFlag;
/// assert!(!UnsafeFlag::default().get()); // default is false
/// let mut flag = UnsafeFlag::new();
/// assert!(!flag.get()); // defaults to false
/// // can only set it to true in unsafe code
/// unsafe { flag.set(true) };
/// assert!(flag.get()); // now true
/// ```
#[derive(Debug, Clone)]
#[doc(hidden)]
pub struct UnsafeFlag(bool);

impl UnsafeFlag {
    /// Creates a new `UnsafeFlag` with the value set to `false`.
    ///
    /// See the example on [`UnsafeFlag`]
    #[inline]
    pub const fn new() -> Self {
        UnsafeFlag(false)
    }

    /// Sets the value of the flag to the given value
    ///
    /// Note this can purposely only be done in `unsafe` code
    ///
    /// # Safety
    ///
    /// If set, the flag will be set to the given value. There is nothing
    /// immediately unsafe about doing so, however, the flag can be used to
    /// subsequently bypass safety checks in the [`ArrayDataBuilder`].
    #[inline]
    pub unsafe fn set(&mut self, val: bool) {
        let Self(flag) = self;
        *flag = val;
    }

    /// Returns the value of the flag
    #[inline]
    pub fn get(&self) -> bool {
        let Self(flag) = self;
        *flag
    }
}

// Manual impl to make it clear you can not construct unsafe with true
impl Default for UnsafeFlag {
    fn default() -> Self {
        UnsafeFlag::new()
    }
}
2009
2010/// Builder for [`ArrayData`] type
2011#[derive(Debug)]
2012pub struct ArrayDataBuilder {
2013    data_type: DataType,
2014    len: usize,
2015    null_count: Option<usize>,
2016    null_bit_buffer: Option<Buffer>,
2017    nulls: Option<NullBuffer>,
2018    offset: usize,
2019    buffers: Vec<Buffer>,
2020    child_data: Vec<ArrayData>,
2021    /// Should buffers be realigned (copying if necessary)?
2022    ///
2023    /// Defaults to false.
2024    align_buffers: bool,
2025    /// Should data validation be skipped for this [`ArrayData`]?
2026    ///
2027    /// Defaults to false.
2028    ///
2029    /// # Safety
2030    ///
2031    /// This flag can only be set to true using `unsafe` APIs. However, once true
2032    /// subsequent calls to `build()` may result in undefined behavior if the data
2033    /// is not valid.
2034    skip_validation: UnsafeFlag,
2035}
2036
2037impl ArrayDataBuilder {
2038    #[inline]
2039    /// Creates a new array data builder
2040    pub const fn new(data_type: DataType) -> Self {
2041        Self {
2042            data_type,
2043            len: 0,
2044            null_count: None,
2045            null_bit_buffer: None,
2046            nulls: None,
2047            offset: 0,
2048            buffers: vec![],
2049            child_data: vec![],
2050            align_buffers: false,
2051            skip_validation: UnsafeFlag::new(),
2052        }
2053    }
2054
2055    /// Creates a new array data builder from an existing one, changing the data type
2056    pub fn data_type(self, data_type: DataType) -> Self {
2057        Self { data_type, ..self }
2058    }
2059
2060    #[inline]
2061    #[allow(clippy::len_without_is_empty)]
2062    /// Sets the length of the [ArrayData]
2063    pub const fn len(mut self, n: usize) -> Self {
2064        self.len = n;
2065        self
2066    }
2067
2068    /// Sets the null buffer of the [ArrayData]
2069    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
2070        self.nulls = nulls;
2071        self.null_count = None;
2072        self.null_bit_buffer = None;
2073        self
2074    }
2075
2076    /// Sets the null count of the [ArrayData]
2077    pub fn null_count(mut self, null_count: usize) -> Self {
2078        self.null_count = Some(null_count);
2079        self
2080    }
2081
2082    /// Sets the `null_bit_buffer` of the [ArrayData]
2083    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
2084        self.nulls = None;
2085        self.null_bit_buffer = buf;
2086        self
2087    }
2088
2089    /// Sets the offset of the [ArrayData]
2090    #[inline]
2091    pub const fn offset(mut self, n: usize) -> Self {
2092        self.offset = n;
2093        self
2094    }
2095
2096    /// Sets the buffers of the [ArrayData]
2097    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
2098        self.buffers = v;
2099        self
2100    }
2101
2102    /// Adds a single buffer to the [ArrayData]'s buffers
2103    pub fn add_buffer(mut self, b: Buffer) -> Self {
2104        self.buffers.push(b);
2105        self
2106    }
2107
2108    /// Adds multiple buffers to the [ArrayData]'s buffers
2109    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
2110        self.buffers.extend(bs);
2111        self
2112    }
2113
2114    /// Sets the child data of the [ArrayData]
2115    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
2116        self.child_data = v;
2117        self
2118    }
2119
2120    /// Adds a single child data to the [ArrayData]'s child data
2121    pub fn add_child_data(mut self, r: ArrayData) -> Self {
2122        self.child_data.push(r);
2123        self
2124    }
2125
2126    /// Creates an array data, without any validation
2127    ///
2128    /// Note: This is shorthand for
2129    /// ```rust
2130    /// # #[expect(unsafe_op_in_unsafe_fn)]
2131    /// # let mut builder = arrow_data::ArrayDataBuilder::new(arrow_schema::DataType::Null);
2132    /// # let _ = unsafe {
2133    /// builder.skip_validation(true).build().unwrap()
2134    /// # };
2135    /// ```
2136    ///
2137    /// # Safety
2138    ///
2139    /// The same caveats as [`ArrayData::new_unchecked`]
2140    /// apply.
2141    pub unsafe fn build_unchecked(self) -> ArrayData {
2142        unsafe { self.skip_validation(true) }.build().unwrap()
2143    }
2144
2145    /// Creates an `ArrayData`, consuming `self`
2146    ///
2147    /// # Safety
2148    ///
2149    /// By default the underlying buffers are checked to ensure they are valid
2150    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
2151    /// to true (by the `unsafe` API) this validation is skipped. If the data is
2152    /// not valid, undefined behavior will result.
2153    pub fn build(self) -> Result<ArrayData, ArrowError> {
2154        let Self {
2155            data_type,
2156            len,
2157            null_count,
2158            null_bit_buffer,
2159            nulls,
2160            offset,
2161            buffers,
2162            child_data,
2163            align_buffers,
2164            skip_validation,
2165        } = self;
2166
2167        let nulls = nulls
2168            .or_else(|| {
2169                let buffer = null_bit_buffer?;
2170                let buffer = BooleanBuffer::new(buffer, offset, len);
2171                Some(match null_count {
2172                    Some(n) => {
2173                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
2174                        unsafe { NullBuffer::new_unchecked(buffer, n) }
2175                    }
2176                    None => NullBuffer::new(buffer),
2177                })
2178            })
2179            .filter(|b| b.null_count() != 0);
2180
2181        let mut data = ArrayData {
2182            data_type,
2183            len,
2184            offset,
2185            buffers,
2186            child_data,
2187            nulls,
2188        };
2189
2190        if align_buffers {
2191            data.align_buffers();
2192        }
2193
2194        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
2195        if !skip_validation.get() || cfg!(feature = "force_validate") {
2196            data.validate_data()?;
2197        }
2198        Ok(data)
2199    }
2200
2201    /// Creates an array data, validating all inputs, and aligning any buffers
2202    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2203    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2204        self.align_buffers(true).build()
2205    }
2206
2207    /// Ensure that all buffers are aligned, copying data if necessary
2208    ///
2209    /// Rust requires that arrays are aligned to their corresponding primitive,
2210    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2211    ///
2212    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2213    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2214    ///
2215    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2216    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2217    /// when necessary, making it useful when interacting with buffers produced by other systems,
2218    /// e.g. IPC or FFI.
2219    ///
2220    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2221    /// insufficiently aligned buffers.
2222    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2223        self.align_buffers = align_buffers;
2224        self
2225    }
2226
2227    /// Skips validation of the data.
2228    ///
2229    /// If this flag is enabled, `[Self::build`] will skip validation of the
2230    /// data
2231    ///
2232    /// If this flag is not enabled, `[Self::build`] will validate that all
2233    /// buffers are valid and will return an error if any data is invalid.
2234    /// Validation can be expensive.
2235    ///
2236    /// # Safety
2237    ///
2238    /// If validation is skipped, the buffers must form a valid Arrow array,
2239    /// otherwise undefined behavior will result
2240    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2241        unsafe {
2242            self.skip_validation.set(skip_validation);
2243        }
2244        self
2245    }
2246}
2247
2248impl From<ArrayData> for ArrayDataBuilder {
2249    fn from(d: ArrayData) -> Self {
2250        Self {
2251            data_type: d.data_type,
2252            len: d.len,
2253            offset: d.offset,
2254            buffers: d.buffers,
2255            child_data: d.child_data,
2256            nulls: d.nulls,
2257            null_bit_buffer: None,
2258            null_count: None,
2259            align_buffers: false,
2260            skip_validation: UnsafeFlag::new(),
2261        }
2262    }
2263}
2264
2265#[cfg(test)]
2266mod tests {
2267    use super::*;
2268    use arrow_schema::{Field, Fields};
2269
2270    // See arrow/tests/array_data_validation.rs for test of array validation
2271
2272    /// returns a buffer initialized with some constant value for tests
2273    fn make_i32_buffer(n: usize) -> Buffer {
2274        Buffer::from_slice_ref(vec![42i32; n])
2275    }
2276
2277    /// returns a buffer initialized with some constant value for tests
2278    fn make_f32_buffer(n: usize) -> Buffer {
2279        Buffer::from_slice_ref(vec![42f32; n])
2280    }
2281
2282    #[test]
2283    fn test_builder() {
2284        // Buffer needs to be at least 25 long
2285        let v = (0..25).collect::<Vec<i32>>();
2286        let b1 = Buffer::from_slice_ref(&v);
2287        let arr_data = ArrayData::builder(DataType::Int32)
2288            .len(20)
2289            .offset(5)
2290            .add_buffer(b1)
2291            .null_bit_buffer(Some(Buffer::from([
2292                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2293            ])))
2294            .build()
2295            .unwrap();
2296
2297        assert_eq!(20, arr_data.len());
2298        assert_eq!(10, arr_data.null_count());
2299        assert_eq!(5, arr_data.offset());
2300        assert_eq!(1, arr_data.buffers().len());
2301        assert_eq!(
2302            Buffer::from_slice_ref(&v).as_slice(),
2303            arr_data.buffers()[0].as_slice()
2304        );
2305    }
2306
2307    #[test]
2308    fn test_builder_with_child_data() {
2309        let child_arr_data = ArrayData::try_new(
2310            DataType::Int32,
2311            5,
2312            None,
2313            0,
2314            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2315            vec![],
2316        )
2317        .unwrap();
2318
2319        let field = Arc::new(Field::new("x", DataType::Int32, true));
2320        let data_type = DataType::Struct(vec![field].into());
2321
2322        let arr_data = ArrayData::builder(data_type)
2323            .len(5)
2324            .offset(0)
2325            .add_child_data(child_arr_data.clone())
2326            .build()
2327            .unwrap();
2328
2329        assert_eq!(5, arr_data.len());
2330        assert_eq!(1, arr_data.child_data().len());
2331        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2332    }
2333
2334    #[test]
2335    fn test_null_count() {
2336        let mut bit_v: [u8; 2] = [0; 2];
2337        bit_util::set_bit(&mut bit_v, 0);
2338        bit_util::set_bit(&mut bit_v, 3);
2339        bit_util::set_bit(&mut bit_v, 10);
2340        let arr_data = ArrayData::builder(DataType::Int32)
2341            .len(16)
2342            .add_buffer(make_i32_buffer(16))
2343            .null_bit_buffer(Some(Buffer::from(bit_v)))
2344            .build()
2345            .unwrap();
2346        assert_eq!(13, arr_data.null_count());
2347
2348        // Test with offset
2349        let mut bit_v: [u8; 2] = [0; 2];
2350        bit_util::set_bit(&mut bit_v, 0);
2351        bit_util::set_bit(&mut bit_v, 3);
2352        bit_util::set_bit(&mut bit_v, 10);
2353        let arr_data = ArrayData::builder(DataType::Int32)
2354            .len(12)
2355            .offset(2)
2356            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2357            .null_bit_buffer(Some(Buffer::from(bit_v)))
2358            .build()
2359            .unwrap();
2360        assert_eq!(10, arr_data.null_count());
2361    }
2362
2363    #[test]
2364    fn test_null_buffer_ref() {
2365        let mut bit_v: [u8; 2] = [0; 2];
2366        bit_util::set_bit(&mut bit_v, 0);
2367        bit_util::set_bit(&mut bit_v, 3);
2368        bit_util::set_bit(&mut bit_v, 10);
2369        let arr_data = ArrayData::builder(DataType::Int32)
2370            .len(16)
2371            .add_buffer(make_i32_buffer(16))
2372            .null_bit_buffer(Some(Buffer::from(bit_v)))
2373            .build()
2374            .unwrap();
2375        assert!(arr_data.nulls().is_some());
2376        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2377    }
2378
2379    #[test]
2380    fn test_slice() {
2381        let mut bit_v: [u8; 2] = [0; 2];
2382        bit_util::set_bit(&mut bit_v, 0);
2383        bit_util::set_bit(&mut bit_v, 3);
2384        bit_util::set_bit(&mut bit_v, 10);
2385        let data = ArrayData::builder(DataType::Int32)
2386            .len(16)
2387            .add_buffer(make_i32_buffer(16))
2388            .null_bit_buffer(Some(Buffer::from(bit_v)))
2389            .build()
2390            .unwrap();
2391        let new_data = data.slice(1, 15);
2392        assert_eq!(data.len() - 1, new_data.len());
2393        assert_eq!(1, new_data.offset());
2394        assert_eq!(data.null_count(), new_data.null_count());
2395
2396        // slice of a slice (removes one null)
2397        let new_data = new_data.slice(1, 14);
2398        assert_eq!(data.len() - 2, new_data.len());
2399        assert_eq!(2, new_data.offset());
2400        assert_eq!(data.null_count() - 1, new_data.null_count());
2401    }
2402
2403    #[test]
2404    #[should_panic(expected = "offset + length overflow")]
2405    fn test_slice_panics_on_offset_length_overflow() {
2406        let data = ArrayData::builder(DataType::Int32)
2407            .len(4)
2408            .add_buffer(make_i32_buffer(4))
2409            .build()
2410            .unwrap();
2411        let sliced = data.slice(1, 3);
2412
2413        sliced.slice(1, usize::MAX);
2414    }
2415
2416    #[test]
2417    fn test_typed_offsets_length_overflow() {
2418        let data = ArrayData {
2419            data_type: DataType::Binary,
2420            len: usize::MAX,
2421            offset: 0,
2422            buffers: vec![Buffer::from_slice_ref([0_i32])],
2423            child_data: vec![],
2424            nulls: None,
2425        };
2426        let err = data.typed_offsets::<i32>().unwrap_err();
2427
2428        assert_eq!(
2429            err.to_string(),
2430            format!(
2431                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2432                usize::MAX
2433            )
2434        );
2435    }
2436
2437    #[test]
2438    fn test_validate_typed_buffer_length_overflow() {
2439        let data = ArrayData {
2440            data_type: DataType::Binary,
2441            len: 0,
2442            offset: 2,
2443            buffers: vec![Buffer::from_slice_ref([0_i32])],
2444            child_data: vec![],
2445            nulls: None,
2446        };
2447        let err = data.typed_buffer::<i32>(0, usize::MAX).unwrap_err();
2448
2449        assert_eq!(
2450            err.to_string(),
2451            format!(
2452                "Invalid argument error: Length {} with offset 2 overflows usize for Binary",
2453                usize::MAX
2454            )
2455        );
2456    }
2457
2458    // Exercises ArrayData::try_new with len + offset overflowing
2459    fn try_new_binary_length_offset_overflow() -> Result<ArrayData, ArrowError> {
2460        ArrayData::try_new(
2461            DataType::Binary,
2462            usize::MAX,
2463            None,
2464            1,
2465            vec![
2466                Buffer::from_slice_ref([0_i32]),
2467                Buffer::from_iter(std::iter::empty::<u8>()),
2468            ],
2469            vec![],
2470        )
2471    }
2472
2473    #[cfg(not(feature = "force_validate"))]
2474    #[test]
2475    fn test_try_new_length_offset_overflow() {
2476        let err = try_new_binary_length_offset_overflow().unwrap_err();
2477
2478        assert_eq!(
2479            err.to_string(),
2480            format!(
2481                "Invalid argument error: Length {} with offset 1 overflows usize for Binary",
2482                usize::MAX
2483            )
2484        );
2485    }
2486
2487    #[cfg(feature = "force_validate")]
2488    #[test]
2489    #[should_panic(
2490        expected = "Length 18446744073709551615 with offset 1 overflows usize for Binary"
2491    )]
2492    fn test_try_new_length_offset_overflow_force_validate() {
2493        try_new_binary_length_offset_overflow().unwrap();
2494    }
2495
2496    #[test]
2497    fn test_equality() {
2498        let int_data = ArrayData::builder(DataType::Int32)
2499            .len(1)
2500            .add_buffer(make_i32_buffer(1))
2501            .build()
2502            .unwrap();
2503
2504        let float_data = ArrayData::builder(DataType::Float32)
2505            .len(1)
2506            .add_buffer(make_f32_buffer(1))
2507            .build()
2508            .unwrap();
2509        assert_ne!(int_data, float_data);
2510        assert!(!int_data.ptr_eq(&float_data));
2511        assert!(int_data.ptr_eq(&int_data));
2512
2513        #[allow(clippy::redundant_clone)]
2514        let int_data_clone = int_data.clone();
2515        assert_eq!(int_data, int_data_clone);
2516        assert!(int_data.ptr_eq(&int_data_clone));
2517        assert!(int_data_clone.ptr_eq(&int_data));
2518
2519        let int_data_slice = int_data_clone.slice(1, 0);
2520        assert!(int_data_slice.ptr_eq(&int_data_slice));
2521        assert!(!int_data.ptr_eq(&int_data_slice));
2522        assert!(!int_data_slice.ptr_eq(&int_data));
2523
2524        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2525        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2526        let string_data = ArrayData::try_new(
2527            DataType::Utf8,
2528            3,
2529            Some(Buffer::from_iter(vec![true, false, true])),
2530            0,
2531            vec![offsets_buffer, data_buffer],
2532            vec![],
2533        )
2534        .unwrap();
2535
2536        assert_ne!(float_data, string_data);
2537        assert!(!float_data.ptr_eq(&string_data));
2538
2539        assert!(string_data.ptr_eq(&string_data));
2540
2541        #[allow(clippy::redundant_clone)]
2542        let string_data_cloned = string_data.clone();
2543        assert!(string_data_cloned.ptr_eq(&string_data));
2544        assert!(string_data.ptr_eq(&string_data_cloned));
2545
2546        let string_data_slice = string_data.slice(1, 2);
2547        assert!(string_data_slice.ptr_eq(&string_data_slice));
2548        assert!(!string_data_slice.ptr_eq(&string_data))
2549    }
2550
2551    #[test]
2552    fn test_slice_memory_size() {
2553        let mut bit_v: [u8; 2] = [0; 2];
2554        bit_util::set_bit(&mut bit_v, 0);
2555        bit_util::set_bit(&mut bit_v, 3);
2556        bit_util::set_bit(&mut bit_v, 10);
2557        let data = ArrayData::builder(DataType::Int32)
2558            .len(16)
2559            .add_buffer(make_i32_buffer(16))
2560            .null_bit_buffer(Some(Buffer::from(bit_v)))
2561            .build()
2562            .unwrap();
2563        let new_data = data.slice(1, 14);
2564        assert_eq!(
2565            data.get_slice_memory_size().unwrap() - 8,
2566            new_data.get_slice_memory_size().unwrap()
2567        );
2568        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2569        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2570        let string_data = ArrayData::try_new(
2571            DataType::Utf8,
2572            3,
2573            Some(Buffer::from_iter(vec![true, false, true])),
2574            0,
2575            vec![offsets_buffer, data_buffer],
2576            vec![],
2577        )
2578        .unwrap();
2579        let string_data_slice = string_data.slice(1, 2);
2580        //4 bytes of offset and 2 bytes of data reduced by slicing.
2581        assert_eq!(
2582            string_data.get_slice_memory_size().unwrap() - 6,
2583            string_data_slice.get_slice_memory_size().unwrap()
2584        );
2585    }
2586
2587    #[test]
2588    fn test_count_nulls() {
2589        let buffer = Buffer::from([0b00010110, 0b10011111]);
2590        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2591        let count = count_nulls(Some(&buffer), 0, 16);
2592        assert_eq!(count, 7);
2593
2594        let count = count_nulls(Some(&buffer), 4, 8);
2595        assert_eq!(count, 3);
2596    }
2597
2598    #[test]
2599    fn test_contains_nulls() {
2600        let buffer: Buffer =
2601            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2602        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2603        assert!(contains_nulls(Some(&buffer), 0, 6));
2604        assert!(contains_nulls(Some(&buffer), 0, 3));
2605        assert!(!contains_nulls(Some(&buffer), 3, 2));
2606        assert!(!contains_nulls(Some(&buffer), 0, 0));
2607    }
2608
2609    #[test]
2610    fn test_alignment() {
2611        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2612        let sliced = buffer.slice(1);
2613
2614        let mut data = ArrayData {
2615            data_type: DataType::Int32,
2616            len: 0,
2617            offset: 0,
2618            buffers: vec![buffer],
2619            child_data: vec![],
2620            nulls: None,
2621        };
2622        data.validate_full().unwrap();
2623
2624        // break alignment in data
2625        data.buffers[0] = sliced;
2626        let err = data.validate().unwrap_err();
2627
2628        assert_eq!(
2629            err.to_string(),
2630            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2631        );
2632
2633        data.align_buffers();
2634        data.validate_full().unwrap();
2635    }
2636
2637    #[test]
2638    fn test_alignment_struct() {
2639        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2640        let sliced = buffer.slice(1);
2641
2642        let child_data = ArrayData {
2643            data_type: DataType::Int32,
2644            len: 0,
2645            offset: 0,
2646            buffers: vec![buffer],
2647            child_data: vec![],
2648            nulls: None,
2649        };
2650
2651        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2652        let mut data = ArrayData {
2653            data_type: schema,
2654            len: 0,
2655            offset: 0,
2656            buffers: vec![],
2657            child_data: vec![child_data],
2658            nulls: None,
2659        };
2660        data.validate_full().unwrap();
2661
2662        // break alignment in child data
2663        data.child_data[0].buffers[0] = sliced;
2664        let err = data.validate().unwrap_err();
2665
2666        assert_eq!(
2667            err.to_string(),
2668            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2669        );
2670
2671        data.align_buffers();
2672        data.validate_full().unwrap();
2673    }
2674
2675    #[test]
2676    fn test_null_view_types() {
2677        let array_len = 32;
2678        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2679        assert_eq!(array.len(), array_len);
2680        for i in 0..array.len() {
2681            assert!(array.is_null(i));
2682        }
2683
2684        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2685        assert_eq!(array.len(), array_len);
2686        for i in 0..array.len() {
2687            assert!(array.is_null(i));
2688        }
2689
2690        let array = ArrayData::new_null(
2691            &DataType::ListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2692            array_len,
2693        );
2694        assert_eq!(array.len(), array_len);
2695        for i in 0..array.len() {
2696            assert!(array.is_null(i));
2697        }
2698
2699        let array = ArrayData::new_null(
2700            &DataType::LargeListView(Arc::new(Field::new_list_field(DataType::Int32, true))),
2701            array_len,
2702        );
2703        assert_eq!(array.len(), array_len);
2704        for i in 0..array.len() {
2705            assert!(array.is_null(i));
2706        }
2707    }
2708}