Skip to main content

vortex_array/arrays/struct_/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5use std::iter::once;
6use std::sync::Arc;
7
8use vortex_dtype::DType;
9use vortex_dtype::FieldName;
10use vortex_dtype::FieldNames;
11use vortex_dtype::StructFields;
12use vortex_error::VortexExpect;
13use vortex_error::VortexResult;
14use vortex_error::vortex_bail;
15use vortex_error::vortex_err;
16
17use crate::Array;
18use crate::ArrayRef;
19use crate::IntoArray;
20use crate::stats::ArrayStats;
21use crate::validity::Validity;
22use crate::vtable::ValidityHelper;
23
/// A struct array that stores multiple named fields as columns, similar to a database row.
///
/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
/// of structured data where each row contains multiple named fields of potentially different types.
///
/// ## Data Layout
///
/// The struct array uses a columnar layout where:
/// - Each field is stored as a separate child array
/// - All fields must have the same length (number of rows)
/// - Field names and types are defined in the struct's dtype
/// - An optional validity mask indicates which entire rows are null
///
/// ## Row-level nulls
///
/// The StructArray contains its own top-level nulls, which are superimposed on top of the
/// field-level validity values. This applies even if the fields themselves are non-nullable:
/// accessing a particular row can yield null even if all children are valid at that position.
///
/// ```
/// use vortex_array::arrays::{StructArray, BoolArray};
/// use vortex_array::validity::Validity;
/// use vortex_array::IntoArray;
/// use vortex_dtype::FieldNames;
/// use vortex_buffer::buffer;
///
/// // Create struct with all non-null fields but struct-level nulls
/// let struct_array = StructArray::try_new(
///     FieldNames::from(["a", "b", "c"]),
///     vec![
///         buffer![1i32, 2i32].into_array(),  // non-null field a
///         buffer![10i32, 20i32].into_array(), // non-null field b
///         buffer![100i32, 200i32].into_array(), // non-null field c
///     ],
///     2,
///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
/// ).unwrap();
///
/// // Row 0 is valid - returns a struct scalar with field values
/// let row0 = struct_array.scalar_at(0).unwrap();
/// assert!(!row0.is_null());
///
/// // Row 1 is null at struct level - returns null even though fields have values
/// let row1 = struct_array.scalar_at(1).unwrap();
/// assert!(row1.is_null());
/// ```
///
/// ## Name uniqueness
///
/// It is valid for a StructArray to have multiple child columns that have the same name. In this
/// case, any accessors that use column names will find the first column in sequence with the name.
///
/// ```
/// use vortex_array::arrays::StructArray;
/// use vortex_array::validity::Validity;
/// use vortex_array::IntoArray;
/// use vortex_dtype::FieldNames;
/// use vortex_buffer::buffer;
///
/// // Create struct with duplicate "data" field names
/// let struct_array = StructArray::try_new(
///     FieldNames::from(["data", "data"]),
///     vec![
///         buffer![1i32, 2i32].into_array(),   // first "data"
///         buffer![3i32, 4i32].into_array(),   // second "data"
///     ],
///     2,
///     Validity::NonNullable,
/// ).unwrap();
///
/// // unmasked_field_by_name returns the FIRST "data" field
/// let first_data = struct_array.unmasked_field_by_name("data").unwrap();
/// assert_eq!(first_data.scalar_at(0).unwrap(), 1i32.into());
/// ```
///
/// ## Field Operations
///
/// Struct arrays support efficient column operations:
/// - **Projection**: Select/reorder fields without copying data
/// - **Field access**: Get columns by name or index
/// - **Column addition**: Add new fields to create extended structs
/// - **Column removal**: Remove fields to create narrower structs
///
/// ## Validity Semantics
///
/// - Row-level nulls are tracked in the struct's validity child
/// - Individual field nulls are tracked in each field's own validity
/// - A null struct row means all fields in that row are conceptually null
/// - Field-level nulls can exist independently of struct-level nulls
///
/// # Examples
///
/// ```
/// use vortex_array::arrays::{StructArray, PrimitiveArray};
/// use vortex_array::validity::Validity;
/// use vortex_array::IntoArray;
/// use vortex_dtype::FieldNames;
/// use vortex_buffer::buffer;
///
/// // Create arrays for each field
/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
/// let scores = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
///
/// // Create struct array with named fields
/// let struct_array = StructArray::try_new(
///     FieldNames::from(["id", "score"]),
///     vec![ids.into_array(), scores.into_array()],
///     3,
///     Validity::NonNullable,
/// ).unwrap();
///
/// assert_eq!(struct_array.len(), 3);
/// assert_eq!(struct_array.names().len(), 2);
///
/// // Access field by name
/// let id_field = struct_array.unmasked_field_by_name("id").unwrap();
/// assert_eq!(id_field.len(), 3);
/// ```
#[derive(Clone, Debug)]
pub struct StructArray {
    /// Number of struct rows; every child array is validated to have this length.
    pub(super) len: usize,
    /// Always a `DType::Struct`, holding the field names/types and the nullability.
    pub(super) dtype: DType,
    /// One child array per field, stored without the struct-level validity applied.
    pub(super) fields: Arc<[ArrayRef]>,
    /// Row-level (struct-level) validity.
    pub(super) validity: Validity,
    /// Statistics holder; starts out as the default value on construction.
    pub(super) stats_set: ArrayStats,
}
150
151pub struct StructArrayParts {
152    pub struct_fields: StructFields,
153    pub fields: Arc<[ArrayRef]>,
154    pub validity: Validity,
155}
156
157impl StructArray {
158    /// Return the struct fields without the validity of the struct applied
159    pub fn unmasked_fields(&self) -> &Arc<[ArrayRef]> {
160        &self.fields
161    }
162
163    /// Return the struct field without the validity of the struct applied
164    pub fn unmasked_field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
165        let name = name.as_ref();
166        self.unmasked_field_by_name_opt(name).ok_or_else(|| {
167            vortex_err!(
168                "Field {name} not found in struct array with names {:?}",
169                self.names()
170            )
171        })
172    }
173
174    /// Return the struct field without the validity of the struct applied
175    pub fn unmasked_field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
176        let name = name.as_ref();
177        self.struct_fields().find(name).map(|idx| &self.fields[idx])
178    }
179
180    pub fn names(&self) -> &FieldNames {
181        self.struct_fields().names()
182    }
183
184    pub fn struct_fields(&self) -> &StructFields {
185        let Some(struct_dtype) = &self.dtype.as_struct_fields_opt() else {
186            unreachable!(
187                "struct arrays must have be a DType::Struct, this is likely an internal bug."
188            )
189        };
190        struct_dtype
191    }
192
193    /// Create a new `StructArray` with the given length, but without any fields.
194    pub fn new_fieldless_with_len(len: usize) -> Self {
195        Self::try_new(
196            FieldNames::default(),
197            Vec::new(),
198            len,
199            Validity::NonNullable,
200        )
201        .vortex_expect("StructArray::new_with_len should not fail")
202    }
203
204    /// Creates a new [`StructArray`].
205    ///
206    /// # Panics
207    ///
208    /// Panics if the provided components do not satisfy the invariants documented
209    /// in [`StructArray::new_unchecked`].
210    pub fn new(
211        names: FieldNames,
212        fields: impl Into<Arc<[ArrayRef]>>,
213        length: usize,
214        validity: Validity,
215    ) -> Self {
216        Self::try_new(names, fields, length, validity)
217            .vortex_expect("StructArray construction failed")
218    }
219
220    /// Constructs a new `StructArray`.
221    ///
222    /// See [`StructArray::new_unchecked`] for more information.
223    ///
224    /// # Errors
225    ///
226    /// Returns an error if the provided components do not satisfy the invariants documented in
227    /// [`StructArray::new_unchecked`].
228    pub fn try_new(
229        names: FieldNames,
230        fields: impl Into<Arc<[ArrayRef]>>,
231        length: usize,
232        validity: Validity,
233    ) -> VortexResult<Self> {
234        let fields = fields.into();
235        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect();
236        let dtype = StructFields::new(names, field_dtypes);
237
238        Self::validate(&fields, &dtype, length, &validity)?;
239
240        // SAFETY: validate ensures all invariants are met.
241        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
242    }
243
244    /// Creates a new [`StructArray`] without validation from these components:
245    ///
246    /// * `fields` is a vector of arrays, one for each field in the struct.
247    /// * `dtype` contains the field names and types.
248    /// * `length` is the number of struct rows.
249    /// * `validity` holds the null values.
250    ///
251    /// # Safety
252    ///
253    /// The caller must ensure all of the following invariants are satisfied:
254    ///
255    /// ## Field Requirements
256    ///
257    /// - `fields.len()` must exactly equal `dtype.names().len()`.
258    /// - Every field array in `fields` must have length exactly equal to `length`.
259    /// - For each index `i`, `fields[i].dtype()` must exactly match `dtype.fields()[i]`.
260    ///
261    /// ## Type Requirements
262    ///
263    /// - Field names in `dtype` may be duplicated (this is explicitly allowed).
264    /// - The nullability of `dtype` must match the nullability of `validity`.
265    ///
266    /// ## Validity Requirements
267    ///
268    /// - If `validity` is [`Validity::Array`], its length must exactly equal `length`.
269    pub unsafe fn new_unchecked(
270        fields: impl Into<Arc<[ArrayRef]>>,
271        dtype: StructFields,
272        length: usize,
273        validity: Validity,
274    ) -> Self {
275        let fields = fields.into();
276
277        #[cfg(debug_assertions)]
278        Self::validate(&fields, &dtype, length, &validity)
279            .vortex_expect("[Debug Assertion]: Invalid `StructArray` parameters");
280
281        Self {
282            len: length,
283            dtype: DType::Struct(dtype, validity.nullability()),
284            fields,
285            validity,
286            stats_set: Default::default(),
287        }
288    }
289
290    /// Validates the components that would be used to create a [`StructArray`].
291    ///
292    /// This function checks all the invariants required by [`StructArray::new_unchecked`].
293    pub fn validate(
294        fields: &[ArrayRef],
295        dtype: &StructFields,
296        length: usize,
297        validity: &Validity,
298    ) -> VortexResult<()> {
299        // Check field count matches
300        if fields.len() != dtype.names().len() {
301            vortex_bail!(
302                InvalidArgument: "Got {} fields but dtype has {} names",
303                fields.len(),
304                dtype.names().len()
305            );
306        }
307
308        // Check each field's length and dtype
309        for (i, (field, struct_dt)) in fields.iter().zip(dtype.fields()).enumerate() {
310            if field.len() != length {
311                vortex_bail!(
312                    InvalidArgument: "Field {} has length {} but expected {}",
313                    i,
314                    field.len(),
315                    length
316                );
317            }
318
319            if field.dtype() != &struct_dt {
320                vortex_bail!(
321                    InvalidArgument: "Field {} has dtype {} but expected {}",
322                    i,
323                    field.dtype(),
324                    struct_dt
325                );
326            }
327        }
328
329        // Check validity length
330        if let Some(validity_len) = validity.maybe_len()
331            && validity_len != length
332        {
333            vortex_bail!(
334                InvalidArgument: "Validity has length {} but expected {}",
335                validity_len,
336                length
337            );
338        }
339
340        Ok(())
341    }
342
343    pub fn try_new_with_dtype(
344        fields: impl Into<Arc<[ArrayRef]>>,
345        dtype: StructFields,
346        length: usize,
347        validity: Validity,
348    ) -> VortexResult<Self> {
349        let fields = fields.into();
350        Self::validate(&fields, &dtype, length, &validity)?;
351
352        // SAFETY: validate ensures all invariants are met.
353        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
354    }
355
356    pub fn into_parts(self) -> StructArrayParts {
357        let struct_fields = self.dtype.into_struct_fields();
358        StructArrayParts {
359            struct_fields,
360            fields: self.fields,
361            validity: self.validity,
362        }
363    }
364
365    pub fn into_fields(self) -> Vec<ArrayRef> {
366        self.into_parts().fields.to_vec()
367    }
368
369    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
370        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array())))
371    }
372
373    pub fn try_from_iter_with_validity<
374        N: AsRef<str>,
375        A: IntoArray,
376        T: IntoIterator<Item = (N, A)>,
377    >(
378        iter: T,
379        validity: Validity,
380    ) -> VortexResult<Self> {
381        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
382            .into_iter()
383            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
384            .unzip();
385        let len = fields
386            .first()
387            .map(|f| f.len())
388            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
389
390        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
391    }
392
393    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
394        iter: T,
395    ) -> VortexResult<Self> {
396        Self::try_from_iter_with_validity(iter, Validity::NonNullable)
397    }
398
399    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
400    /// Return a new StructArray with the given projection applied.
401    ///
402    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
403    /// which specifies the new ordering of columns in the struct. The projection can be used to
404    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
405    /// copying.
406    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
407        let mut children = Vec::with_capacity(projection.len());
408        let mut names = Vec::with_capacity(projection.len());
409
410        let fields = self.unmasked_fields();
411        for f_name in projection.iter() {
412            let idx = self
413                .names()
414                .iter()
415                .position(|name| name == f_name)
416                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
417
418            names.push(self.names()[idx].clone());
419            children.push(fields[idx].clone());
420        }
421
422        StructArray::try_new(
423            FieldNames::from(names.as_slice()),
424            children,
425            self.len(),
426            self.validity().clone(),
427        )
428    }
429
430    /// Removes and returns a column from the struct array by name.
431    /// If the column does not exist, returns `None`.
432    pub fn remove_column(&mut self, name: impl Into<FieldName>) -> Option<ArrayRef> {
433        let name = name.into();
434
435        let struct_dtype = self.struct_fields().clone();
436
437        let position = struct_dtype
438            .names()
439            .iter()
440            .position(|field_name| field_name.as_ref() == name.as_ref())?;
441
442        let field = self.fields[position].clone();
443        let new_fields: Arc<[ArrayRef]> = self
444            .fields
445            .iter()
446            .enumerate()
447            .filter(|(i, _)| *i != position)
448            .map(|(_, f)| f.clone())
449            .collect();
450
451        if let Ok(new_dtype) = struct_dtype.without_field(position) {
452            self.fields = new_fields;
453            self.dtype = DType::Struct(new_dtype, self.dtype.nullability());
454            return Some(field);
455        }
456        None
457    }
458
459    /// Create a new StructArray by appending a new column onto the existing array.
460    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
461        let name = name.into();
462        let struct_dtype = self.struct_fields().clone();
463
464        let names = struct_dtype.names().iter().cloned().chain(once(name));
465        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
466        let new_fields = StructFields::new(names.collect(), types.collect());
467
468        let children: Arc<[ArrayRef]> = self.fields.iter().cloned().chain(once(array)).collect();
469
470        Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone())
471    }
472}