// vortex_array/arrays/struct_/array.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5use std::iter::once;
6use std::sync::Arc;
7
8use vortex_dtype::{DType, FieldName, FieldNames, StructFields};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
10
11use crate::stats::ArrayStats;
12use crate::validity::Validity;
13use crate::vtable::ValidityHelper;
14use crate::{Array, ArrayRef, IntoArray};
15
16/// A struct array that stores multiple named fields as columns, similar to a database row.
17///
18/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
19/// of structured data where each row contains multiple named fields of potentially different types.
20///
21/// ## Data Layout
22///
23/// The struct array uses a columnar layout where:
24/// - Each field is stored as a separate child array
25/// - All fields must have the same length (number of rows)
26/// - Field names and types are defined in the struct's dtype
27/// - An optional validity mask indicates which entire rows are null
28///
29/// ## Row-level nulls
30///
31/// The StructArray contains its own top-level nulls, which are superimposed on top of the
32/// field-level validity values. This can be the case even if the fields themselves are non-nullable,
33/// accessing a particular row can yield nulls even if all children are valid at that position.
34///
35/// ```
36/// use vortex_array::arrays::{StructArray, BoolArray};
37/// use vortex_array::validity::Validity;
38/// use vortex_array::IntoArray;
39/// use vortex_dtype::FieldNames;
40/// use vortex_buffer::buffer;
41///
42/// // Create struct with all non-null fields but struct-level nulls
43/// let struct_array = StructArray::try_new(
44///     FieldNames::from(["a", "b", "c"]),
45///     vec![
46///         buffer![1i32, 2i32].into_array(),  // non-null field a
47///         buffer![10i32, 20i32].into_array(), // non-null field b
48///         buffer![100i32, 200i32].into_array(), // non-null field c
49///     ],
50///     2,
51///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
52/// ).unwrap();
53///
54/// // Row 0 is valid - returns a struct scalar with field values
55/// let row0 = struct_array.scalar_at(0);
56/// assert!(!row0.is_null());
57///
58/// // Row 1 is null at struct level - returns null even though fields have values
59/// let row1 = struct_array.scalar_at(1);
60/// assert!(row1.is_null());
61/// ```
62///
63/// ## Name uniqueness
64///
65/// It is valid for a StructArray to have multiple child columns that have the same name. In this
66/// case, any accessors that use column names will find the first column in sequence with the name.
67///
68/// ```
69/// use vortex_array::arrays::StructArray;
70/// use vortex_array::validity::Validity;
71/// use vortex_array::IntoArray;
72/// use vortex_dtype::FieldNames;
73/// use vortex_buffer::buffer;
74///
75/// // Create struct with duplicate "data" field names
76/// let struct_array = StructArray::try_new(
77///     FieldNames::from(["data", "data"]),
78///     vec![
79///         buffer![1i32, 2i32].into_array(),   // first "data"
80///         buffer![3i32, 4i32].into_array(),   // second "data"
81///     ],
82///     2,
83///     Validity::NonNullable,
84/// ).unwrap();
85///
86/// // field_by_name returns the FIRST "data" field
87/// let first_data = struct_array.field_by_name("data").unwrap();
88/// assert_eq!(first_data.scalar_at(0), 1i32.into());
89/// ```
90///
91/// ## Field Operations
92///
93/// Struct arrays support efficient column operations:
94/// - **Projection**: Select/reorder fields without copying data
95/// - **Field access**: Get columns by name or index
96/// - **Column addition**: Add new fields to create extended structs
97/// - **Column removal**: Remove fields to create narrower structs
98///
99/// ## Validity Semantics
100///
101/// - Row-level nulls are tracked in the struct's validity child
102/// - Individual field nulls are tracked in each field's own validity
103/// - A null struct row means all fields in that row are conceptually null
104/// - Field-level nulls can exist independently of struct-level nulls
105///
106/// # Examples
107///
108/// ```
109/// use vortex_array::arrays::{StructArray, PrimitiveArray};
110/// use vortex_array::validity::Validity;
111/// use vortex_array::IntoArray;
112/// use vortex_dtype::FieldNames;
113/// use vortex_buffer::buffer;
114///
115/// // Create arrays for each field
116/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
117/// let names = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
118///
119/// // Create struct array with named fields
120/// let struct_array = StructArray::try_new(
121///     FieldNames::from(["id", "score"]),
122///     vec![ids.into_array(), names.into_array()],
123///     3,
124///     Validity::NonNullable,
125/// ).unwrap();
126///
127/// assert_eq!(struct_array.len(), 3);
128/// assert_eq!(struct_array.names().len(), 2);
129///
130/// // Access field by name
131/// let id_field = struct_array.field_by_name("id").unwrap();
132/// assert_eq!(id_field.len(), 3);
133/// ```
134#[derive(Clone, Debug)]
135pub struct StructArray {
136    pub(super) len: usize,
137    pub(super) dtype: DType,
138    pub(super) fields: Arc<[ArrayRef]>,
139    pub(super) validity: Validity,
140    pub(super) stats_set: ArrayStats,
141}
142
143impl StructArray {
144    pub fn fields(&self) -> &Arc<[ArrayRef]> {
145        &self.fields
146    }
147
148    pub fn field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
149        let name = name.as_ref();
150        self.field_by_name_opt(name).ok_or_else(|| {
151            vortex_err!(
152                "Field {name} not found in struct array with names {:?}",
153                self.names()
154            )
155        })
156    }
157
158    pub fn field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
159        let name = name.as_ref();
160        self.struct_fields().find(name).map(|idx| &self.fields[idx])
161    }
162
163    pub fn names(&self) -> &FieldNames {
164        self.struct_fields().names()
165    }
166
167    pub fn struct_fields(&self) -> &StructFields {
168        let Some(struct_dtype) = &self.dtype.as_struct_fields_opt() else {
169            unreachable!(
170                "struct arrays must have be a DType::Struct, this is likely an internal bug."
171            )
172        };
173        struct_dtype
174    }
175
176    /// Create a new `StructArray` with the given length, but without any fields.
177    pub fn new_fieldless_with_len(len: usize) -> Self {
178        Self::try_new(
179            FieldNames::default(),
180            Vec::new(),
181            len,
182            Validity::NonNullable,
183        )
184        .vortex_expect("StructArray::new_with_len should not fail")
185    }
186
187    /// Creates a new [`StructArray`].
188    ///
189    /// # Panics
190    ///
191    /// Panics if the provided components do not satisfy the invariants documented
192    /// in [`StructArray::new_unchecked`].
193    pub fn new(
194        names: FieldNames,
195        fields: impl Into<Arc<[ArrayRef]>>,
196        length: usize,
197        validity: Validity,
198    ) -> Self {
199        Self::try_new(names, fields, length, validity)
200            .vortex_expect("StructArray construction failed")
201    }
202
203    /// Constructs a new `StructArray`.
204    ///
205    /// See [`StructArray::new_unchecked`] for more information.
206    ///
207    /// # Errors
208    ///
209    /// Returns an error if the provided components do not satisfy the invariants documented in
210    /// [`StructArray::new_unchecked`].
211    pub fn try_new(
212        names: FieldNames,
213        fields: impl Into<Arc<[ArrayRef]>>,
214        length: usize,
215        validity: Validity,
216    ) -> VortexResult<Self> {
217        let fields = fields.into();
218        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect();
219        let dtype = StructFields::new(names, field_dtypes);
220
221        Self::validate(&fields, &dtype, length, &validity)?;
222
223        // SAFETY: validate ensures all invariants are met.
224        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
225    }
226
227    /// Creates a new [`StructArray`] without validation from these components:
228    ///
229    /// * `fields` is a vector of arrays, one for each field in the struct.
230    /// * `dtype` contains the field names and types.
231    /// * `length` is the number of struct rows.
232    /// * `validity` holds the null values.
233    ///
234    /// # Safety
235    ///
236    /// The caller must ensure all of the following invariants are satisfied:
237    ///
238    /// ## Field Requirements
239    ///
240    /// - `fields.len()` must exactly equal `dtype.names().len()`.
241    /// - Every field array in `fields` must have length exactly equal to `length`.
242    /// - For each index `i`, `fields[i].dtype()` must exactly match `dtype.fields()[i]`.
243    ///
244    /// ## Type Requirements
245    ///
246    /// - Field names in `dtype` may be duplicated (this is explicitly allowed).
247    /// - The nullability of `dtype` must match the nullability of `validity`.
248    ///
249    /// ## Validity Requirements
250    ///
251    /// - If `validity` is [`Validity::Array`], its length must exactly equal `length`.
252    pub unsafe fn new_unchecked(
253        fields: impl Into<Arc<[ArrayRef]>>,
254        dtype: StructFields,
255        length: usize,
256        validity: Validity,
257    ) -> Self {
258        let fields = fields.into();
259
260        #[cfg(debug_assertions)]
261        Self::validate(&fields, &dtype, length, &validity)
262            .vortex_expect("[Debug Assertion]: Invalid `StructArray` parameters");
263
264        Self {
265            len: length,
266            dtype: DType::Struct(dtype, validity.nullability()),
267            fields,
268            validity,
269            stats_set: Default::default(),
270        }
271    }
272
273    /// Validates the components that would be used to create a [`StructArray`].
274    ///
275    /// This function checks all the invariants required by [`StructArray::new_unchecked`].
276    pub fn validate(
277        fields: &[ArrayRef],
278        dtype: &StructFields,
279        length: usize,
280        validity: &Validity,
281    ) -> VortexResult<()> {
282        // Check field count matches
283        if fields.len() != dtype.names().len() {
284            vortex_bail!(
285                "Got {} fields but dtype has {} names",
286                fields.len(),
287                dtype.names().len()
288            );
289        }
290
291        // Check each field's length and dtype
292        for (i, (field, struct_dt)) in fields.iter().zip(dtype.fields()).enumerate() {
293            if field.len() != length {
294                vortex_bail!(
295                    "Field {} has length {} but expected {}",
296                    i,
297                    field.len(),
298                    length
299                );
300            }
301
302            if field.dtype() != &struct_dt {
303                vortex_bail!(
304                    "Field {} has dtype {} but expected {}",
305                    i,
306                    field.dtype(),
307                    struct_dt
308                );
309            }
310        }
311
312        // Check validity length
313        if let Some(validity_len) = validity.maybe_len()
314            && validity_len != length
315        {
316            vortex_bail!(
317                "Validity has length {} but expected {}",
318                validity_len,
319                length
320            );
321        }
322
323        Ok(())
324    }
325
326    pub fn try_new_with_dtype(
327        fields: impl Into<Arc<[ArrayRef]>>,
328        dtype: StructFields,
329        length: usize,
330        validity: Validity,
331    ) -> VortexResult<Self> {
332        let fields = fields.into();
333        Self::validate(&fields, &dtype, length, &validity)?;
334
335        // SAFETY: validate ensures all invariants are met.
336        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
337    }
338
339    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
340        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array())))
341    }
342
343    pub fn try_from_iter_with_validity<
344        N: AsRef<str>,
345        A: IntoArray,
346        T: IntoIterator<Item = (N, A)>,
347    >(
348        iter: T,
349        validity: Validity,
350    ) -> VortexResult<Self> {
351        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
352            .into_iter()
353            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
354            .unzip();
355        let len = fields
356            .first()
357            .map(|f| f.len())
358            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
359
360        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
361    }
362
363    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
364        iter: T,
365    ) -> VortexResult<Self> {
366        Self::try_from_iter_with_validity(iter, Validity::NonNullable)
367    }
368
369    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
370    /// Return a new StructArray with the given projection applied.
371    ///
372    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
373    /// which specifies the new ordering of columns in the struct. The projection can be used to
374    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
375    /// copying.
376    #[allow(clippy::same_name_method)]
377    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
378        let mut children = Vec::with_capacity(projection.len());
379        let mut names = Vec::with_capacity(projection.len());
380
381        let fields = self.fields();
382        for f_name in projection.iter() {
383            let idx = self
384                .names()
385                .iter()
386                .position(|name| name == f_name)
387                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
388
389            names.push(self.names()[idx].clone());
390            children.push(fields[idx].clone());
391        }
392
393        StructArray::try_new(
394            FieldNames::from(names.as_slice()),
395            children,
396            self.len(),
397            self.validity().clone(),
398        )
399    }
400
401    /// Removes and returns a column from the struct array by name.
402    /// If the column does not exist, returns `None`.
403    pub fn remove_column(&mut self, name: impl Into<FieldName>) -> Option<ArrayRef> {
404        let name = name.into();
405
406        let struct_dtype = self.struct_fields().clone();
407
408        let position = struct_dtype
409            .names()
410            .iter()
411            .position(|field_name| field_name.as_ref() == name.as_ref())?;
412
413        let field = self.fields[position].clone();
414        let new_fields: Arc<[ArrayRef]> = self
415            .fields
416            .iter()
417            .enumerate()
418            .filter(|(i, _)| *i != position)
419            .map(|(_, f)| f.clone())
420            .collect();
421
422        if let Ok(new_dtype) = struct_dtype.without_field(position) {
423            self.fields = new_fields;
424            self.dtype = DType::Struct(new_dtype, self.dtype.nullability());
425            return Some(field);
426        }
427        None
428    }
429
430    /// Create a new StructArray by appending a new column onto the existing array.
431    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
432        let name = name.into();
433        let struct_dtype = self.struct_fields().clone();
434
435        let names = struct_dtype.names().iter().cloned().chain(once(name));
436        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
437        let new_fields = StructFields::new(names.collect(), types.collect());
438
439        let children: Arc<[ArrayRef]> = self.fields.iter().cloned().chain(once(array)).collect();
440
441        Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone())
442    }
443}